Diffstat (limited to 'test/CodeGen')
90 files changed, 6662 insertions, 6661 deletions
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index 86ac5507a407..111aaf88b160 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -135,7 +135,7 @@ continue:
 }
 
 ; Check that we fallback on invoke translation failures.
-; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT quad 2
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %0:_(s128) = G_FCONSTANT fp128 0xL00000000000000004000000000000000
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump
 ; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump:
 define fp128 @test_quad_dump() {
diff --git a/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
new file mode 100644
index 000000000000..47fda8f998d7
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/fp128-legalize-crash-pr35690.mir
@@ -0,0 +1,44 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel -global-isel-abort=0 %s -o - | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64"
+
+  define fp128 @x(fp128 %a) {
+  entry:
+    %a.addr = alloca fp128, align 16
+    store fp128 %a, fp128* %a.addr, align 16
+    %0 = load fp128, fp128* %a.addr, align 16
+    %sub = fsub fp128 0xL00000000000000008000000000000000, %0
+    ret fp128 %sub
+  }
+
+...
+---
+name:            x
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+fixedStack:
+stack:
+  - { id: 0, name: a.addr, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+body:             |
+  bb.1.entry:
+    liveins: %q0
+
+    ; This test just checks we don't crash on G_FNEG of FP128 types. Expect to fall
+    ; back until support is added for fp128.
+    ; CHECK: ret
+    %0:_(s128) = COPY %q0
+    %1:_(p0) = G_FRAME_INDEX %stack.0.a.addr
+    G_STORE %0(s128), %1(p0) :: (store 16 into %ir.a.addr)
+    %2:_(s128) = G_LOAD %1(p0) :: (load 16 from %ir.a.addr)
+    %3:_(s128) = G_FNEG %2
+    %q0 = COPY %3(s128)
+    RET_ReallyLR implicit %q0
+
+...
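For context, the new MIR test above pins down PR35690: G_FNEG of an fp128 value used to crash the GlobalISel legalizer, and until fp128 support lands the legalizer is expected to fall back gracefully (hence -global-isel-abort=0 and the bare CHECK: ret). A minimal source-level sketch of the same pattern (hypothetical reproducer, not part of this commit; front ends of this vintage lower fp128 negation to an fsub from -0.0, which the IRTranslator then emits as G_FNEG):

```c
/* Hypothetical C-level reproducer for PR35690 (assumes a target where
 * __float128 maps to LLVM's fp128). At -O0 the argument round-trips
 * through a stack slot, matching the alloca/store/load in the test IR,
 * and '-a' becomes 'fsub fp128 -0.0, %a', i.e. G_FNEG in generic MIR. */
__float128 x(__float128 a) {
  return -a; /* expected to fall back to SelectionDAG, not crash */
}
```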
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index 865315bbe0a3..4b69575079a3 100644
--- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -1,85 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
 
 %type = type [4 x {i8, i32}]
 
 define %type* @first_offset_const(%type* %addr) {
-; CHECK-LABEL: name: first_offset_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[RES:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: %x0 = COPY [[RES]](p0)
+  ; CHECK-LABEL: name: first_offset_const
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %x0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; CHECK:   [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+  ; CHECK:   %x0 = COPY [[GEP]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type, %type* %addr, i32 1
   ret %type* %res
 }
 
 define %type* @first_offset_trivial(%type* %addr) {
-; CHECK-LABEL: name: first_offset_trivial
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[TRIVIAL:%[0-9]+]]:_(p0) = COPY [[BASE]](p0)
-; CHECK: %x0 = COPY [[TRIVIAL]](p0)
+  ; CHECK-LABEL: name: first_offset_trivial
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %x0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(p0) = COPY [[COPY]](p0)
+  ; CHECK:   %x0 = COPY [[COPY1]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type, %type* %addr, i32 0
   ret %type* %res
 }
 
 define %type* @first_offset_variable(%type* %addr, i64 %idx) {
-; CHECK-LABEL: name: first_offset_variable
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+  ; CHECK-LABEL: name: first_offset_variable
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %x0, %x1
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+  ; CHECK:   [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+  ; CHECK:   %x0 = COPY [[COPY2]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type, %type* %addr, i64 %idx
   ret %type* %res
 }
 
 define %type* @first_offset_ext(%type* %addr, i32 %idx) {
-; CHECK-LABEL: name: first_offset_ext
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX32:%[0-9]+]]:_(s32) = COPY %w1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-; CHECK: [[IDX64:%[0-9]+]]:_(s64) = G_SEXT [[IDX32]](s32)
-; CHECK: [[OFFSET:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX64]]
-; CHECK: [[STEP0:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[STEP0]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+  ; CHECK-LABEL: name: first_offset_ext
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %w1, %x0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s32) = COPY %w1
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+  ; CHECK:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY1]](s32)
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[SEXT]]
+  ; CHECK:   [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP]](p0)
+  ; CHECK:   %x0 = COPY [[COPY2]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type, %type* %addr, i32 %idx
   ret %type* %res
 }
 
 %type1 = type [4 x [4 x i32]]
 define i32* @const_then_var(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: const_then_var
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: [[RES:%[0-9]+]]:_(p0) = COPY [[BASE2]](p0)
-; CHECK: %x0 = COPY [[RES]](p0)
+  ; CHECK-LABEL: name: const_then_var
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %x0, %x1
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 272
+  ; CHECK:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; CHECK:   [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64)
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[COPY1]]
+  ; CHECK:   [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[MUL]](s64)
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(p0) = COPY [[GEP1]](p0)
+  ; CHECK:   %x0 = COPY [[COPY2]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type1, %type1* %addr, i32 4, i32 1, i64 %idx
   ret i32* %res
 }
 
 define i32* @var_then_const(%type1* %addr, i64 %idx) {
-; CHECK-LABEL: name: var_then_const
-; CHECK: [[BASE:%[0-9]+]]:_(p0) = COPY %x0
-; CHECK: [[IDX:%[0-9]+]]:_(s64) = COPY %x1
-; CHECK: [[SIZE:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
-; CHECK: [[OFFSET2:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
-; CHECK: [[OFFSET1:%[0-9]+]]:_(s64) = G_MUL [[SIZE]], [[IDX]]
-; CHECK: [[BASE1:%[0-9]+]]:_(p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[BASE2:%[0-9]+]]:_(p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
-; CHECK: %x0 = COPY [[BASE2]](p0)
+  ; CHECK-LABEL: name: var_then_const
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: %x0, %x1
+  ; CHECK:   [[COPY:%[0-9]+]]:_(p0) = COPY %x0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(s64) = COPY %x1
+  ; CHECK:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+  ; CHECK:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C]], [[COPY1]]
+  ; CHECK:   [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[MUL]](s64)
+  ; CHECK:   [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[GEP]], [[C1]](s64)
+  ; CHECK:   %x0 = COPY [[GEP1]](p0)
+  ; CHECK:   RET_ReallyLR implicit %x0
   %res = getelementptr %type1, %type1* %addr, i64 %idx, i32 2, i32 2
   ret i32* %res
 }
diff --git a/test/CodeGen/AArch64/arm64-jumptable.ll b/test/CodeGen/AArch64/arm64-jumptable.ll
index f5c2ee6da0bf..fac3e5704d15 100644
--- a/test/CodeGen/AArch64/arm64-jumptable.ll
+++ b/test/CodeGen/AArch64/arm64-jumptable.ll
@@ -6,22 +6,20 @@ define void @sum(i32 %a, i32* %to, i32 %c) {
 entry:
   switch i32 %a, label %exit [
     i32 1, label %bb1
-    i32 2, label %bb2
+    i32 2, label %exit.sink.split
     i32 3, label %bb3
     i32 4, label %bb4
   ]
 bb1:
   %b = add i32 %c, 1
-  store i32 %b, i32* %to
-  br label %exit
-bb2:
-  store i32 2, i32* %to
-  br label %exit
+  br label %exit.sink.split
 bb3:
-  store i32 3, i32* %to
-  br label %exit
+  br label %exit.sink.split
 bb4:
-  store i32 5, i32* %to
+  br label %exit.sink.split
+exit.sink.split:
+  %.sink = phi i32 [ 5, %bb4 ], [ %b, %bb1 ], [ 3, %bb3 ], [ %a, %entry ]
+  store i32 %.sink, i32* %to
   br label %exit
 exit:
   ret void
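The arm64-jumptable.ll update above tracks store sinking: the per-case stores to %to are merged into a single store in exit.sink.split, fed by a phi of the stored values. A rough C equivalent of @sum (hypothetical, for orientation only):

```c
/* Hypothetical C shape of @sum. Each case writes a different value
 * through 'to'; the optimizer sinks the four stores into one block and
 * selects the value with a phi. On the case-2 edge the phi reuses %a
 * itself, since a == 2 on that path. */
void sum(int a, int *to, int c) {
  switch (a) {
  case 1: *to = c + 1; break;
  case 2: *to = 2; break;
  case 3: *to = 3; break;
  case 4: *to = 5; break;
  default: return; /* %exit: no store */
  }
}
```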
diff --git a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
index 29036caabf3a..3466e1bace56 100644
--- a/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
+++ b/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
@@ -4,9 +4,10 @@
 ; RUN:   FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s
 ; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
 
-; CHECK: @fct1
+; CHECK-LABEL: fct1:
 ; For small size (<= 256), we do not change memset to bzero.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct1(i8* nocapture %ptr) {
 entry:
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i32 1, i1 false)
@@ -15,20 +16,20 @@ entry:
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
 
-; CHECK: @fct2
+; CHECK-LABEL: fct2:
 ; When the size is bigger than 256, change into bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct2(i8* nocapture %ptr) {
 entry:
   tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i32 1, i1 false)
   ret void
 }
 
-; CHECK: @fct3
+; CHECK-LABEL: fct3:
 ; For unknown size, change to bzero.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct3(i8* nocapture %ptr, i32 %unknown) {
 entry:
   %conv = sext i32 %unknown to i64
@@ -36,9 +37,10 @@ entry:
   ret void
 }
 
-; CHECK: @fct4
+; CHECK-LABEL: fct4:
 ; Size <= 256, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct4(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -50,10 +52,10 @@
 declare i8* @__memset_chk(i8*, i32, i64, i64)
 
 declare i64 @llvm.objectsize.i64(i8*, i1)
 
-; CHECK: @fct5
+; CHECK-LABEL: fct5:
 ; Size > 256, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct5(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -61,10 +63,10 @@ entry:
   ret void
 }
 
-; CHECK: @fct6
+; CHECK-LABEL: fct6:
 ; Size = unknown, change.
-; CHECK-DARWIN: bzero
-; CHECK-LINUX: memset
+; CHECK-DARWIN: {{b|bl}} _bzero
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct6(i8* %ptr, i32 %unknown) {
 entry:
   %conv = sext i32 %unknown to i64
@@ -76,9 +78,10 @@ entry:
 
 ; Next functions check that memset is not turned into bzero
 ; when the set constant is non-zero, whatever the given size.
 
-; CHECK: @fct7
+; CHECK-LABEL: fct7:
 ; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct7(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -86,9 +89,10 @@ entry:
   ret void
 }
 
-; CHECK: @fct8
+; CHECK-LABEL: fct8:
 ; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct8(i8* %ptr) {
 entry:
   %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -96,9 +100,10 @@ entry:
   ret void
 }
 
-; CHECK: @fct9
+; CHECK-LABEL: fct9:
 ; memset with something that is not a zero, no change.
-; CHECK: memset
+; CHECK-DARWIN: {{b|bl}} _memset
+; CHECK-LINUX: {{b|bl}} memset
 define void @fct9(i8* %ptr, i32 %unknown) {
 entry:
   %conv = sext i32 %unknown to i64
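The memset-to-bzero checks above are tightened from bare substring matches to labeled, call-shaped patterns ({{b|bl}} plus the mangled _memset/_bzero names on Darwin). Sketched as hypothetical C, the behavior being pinned down (thresholds taken from the test comments):

```c
/* Hypothetical illustration of the Darwin-only rewrite the test covers:
 * zeroing memsets of more than 256 bytes (or of unknown size) become
 * bzero calls on Darwin; Linux always keeps memset, as do non-zero fills. */
#include <string.h>
void keep_memset(char *p) { memset(p, 0, 256); } /* <= 256: stays memset   */
void to_bzero(char *p)    { memset(p, 0, 257); } /* > 256: _bzero (Darwin) */
void nonzero(char *p)     { memset(p, 1, 512); } /* non-zero fill: memset  */
```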
diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index d22bfc76d1d5..b3a2bcd5d669 100644
--- a/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast                 | FileCheck %s --check-prefixes=CHECK,GENERIC
 ; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOSM1
 
 declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
@@ -47,7 +47,6 @@ declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmla_lane_s16:
 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %b
@@ -58,7 +57,6 @@ entry:
 define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s16:
 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %b
@@ -69,7 +67,6 @@ entry:
 define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmla_lane_s32:
 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %b
@@ -80,7 +77,6 @@ entry:
 define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s32:
 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %b
@@ -91,7 +87,6 @@ entry:
 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s16:
 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -102,7 +97,6 @@ entry:
 define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s16:
 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %b
@@ -113,7 +107,6 @@ entry:
 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s32:
 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -124,7 +117,6 @@ entry:
 define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s32:
 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %b
@@ -135,7 +127,6 @@ entry:
 define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmls_lane_s16:
 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %b
@@ -146,7 +137,6 @@ entry:
 define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s16:
 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %b
@@ -157,7 +147,6 @@ entry:
 define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmls_lane_s32:
 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %b
@@ -168,7 +157,6 @@ entry:
 define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s32:
 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %b
@@ -179,7 +167,6 @@ entry:
 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s16:
 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -190,7 +177,6 @@ entry:
 define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s16:
 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %b
@@ -201,7 +187,6 @@ entry:
 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s32:
 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -212,7 +197,6 @@ entry:
 define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s32:
 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %b
@@ -223,7 +207,6 @@ entry:
 define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_s16:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %a
@@ -233,7 +216,6 @@ entry:
 define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s16:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %a
@@ -243,7 +225,6 @@ entry:
 define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_s32:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %a
@@ -253,7 +234,6 @@ entry:
 define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s32:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %a
@@ -263,7 +243,6 @@ entry:
 define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_u16:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i16> %shuffle, %a
@@ -273,7 +252,6 @@ entry:
 define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u16:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %mul = mul <8 x i16> %shuffle, %a
@@ -283,7 +261,6 @@ entry:
 define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_u32:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %mul = mul <2 x i32> %shuffle, %a
@@ -293,7 +270,6 @@ entry:
 define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u32:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = mul <4 x i32> %shuffle, %a
@@ -303,7 +279,6 @@ entry:
 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s16:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -313,7 +288,6 @@ entry:
 define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s16:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %a
@@ -323,7 +297,6 @@ entry:
 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s32:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -333,7 +306,6 @@ entry:
 define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s32:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %a
@@ -343,7 +315,6 @@ entry:
 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u16:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -353,7 +324,6 @@ entry:
 define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u16:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
   %mul = mul <8 x i16> %shuffle, %a
@@ -363,7 +333,6 @@ entry:
 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u32:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -373,7 +342,6 @@ entry:
 define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u32:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = mul <4 x i32> %shuffle, %a
@@ -382,12 +350,9 @@ entry:
 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfma_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -398,12 +363,9 @@ declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
 define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmaq_lane_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -414,12 +376,9 @@ declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfma_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -428,12 +387,9 @@ entry:
 define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f32:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -442,12 +398,9 @@ entry:
 define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfms_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> <i32 1, i32 1>
@@ -457,12 +410,9 @@ entry:
 define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmsq_lane_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -472,12 +422,9 @@ entry:
 define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfms_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -487,12 +434,9 @@ entry:
 define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f32:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -502,12 +446,9 @@ entry:
 define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
 ; CHECK-LABEL: test_vfmaq_lane_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %lane = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -518,12 +459,9 @@ declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
 define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f64:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -532,12 +470,9 @@ entry:
 define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_lane_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %sub = fsub <1 x double> <double -0.000000e+00>, %v
   %lane = shufflevector <1 x double> %sub, <1 x double> undef, <2 x i32> zeroinitializer
@@ -547,12 +482,9 @@ entry:
 define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f64:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> <i32 1, i32 1>
@@ -563,10 +495,6 @@ entry:
 define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmas_laneq_f32
 ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXNOS-LABEL: test_vfmas_laneq_f32
-; EXNOS: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; EXNOS-NEXT: ret
 entry:
   %extract = extractelement <4 x float> %v, i32 3
   %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
@@ -578,7 +506,6 @@ declare float @llvm.fma.f32(float, float, float)
 define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) {
 ; CHECK-LABEL: test_vfmsd_lane_f64
 ; CHECK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
 entry:
   %extract.rhs = extractelement <1 x double> %v, i32 0
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -591,10 +518,6 @@ declare double @llvm.fma.f64(double, double, double)
 define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmss_lane_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x float> %v, i32 1
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -605,7 +528,6 @@ entry:
 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmss_laneq_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %extract.rhs = extractelement <4 x float> %v, i32 3
   %extract = fsub float -0.000000e+00, %extract.rhs
@@ -616,10 +538,6 @@ entry:
 define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsd_laneq_f64
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsd_laneq_f64
-; EXYNOS: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
 entry:
   %extract.rhs = extractelement <2 x double> %v, i32 1
   %extract = fsub double -0.000000e+00, %extract.rhs
@@ -641,10 +559,6 @@ entry:
 define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmss_lane_f32_0
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmss_lane_f32_0
-; EXYNOS: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
-; EXYNOS-NEXT: ret
 entry:
   %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <2 x float> %tmp0, i32 1
@@ -655,7 +569,6 @@ entry:
 define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmss_laneq_f32_0
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %tmp1 = extractelement <4 x float> %tmp0, i32 3
@@ -666,7 +579,6 @@ entry:
 define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsd_laneq_f64_0
 ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
 entry:
   %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
   %tmp1 = extractelement <2 x double> %tmp0, i32 1
@@ -677,7 +589,6 @@ entry:
 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -688,7 +599,6 @@ entry:
 define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s32:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -699,7 +609,6 @@ entry:
 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s16:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -710,7 +619,6 @@ entry:
 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s32:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -721,7 +629,6 @@ entry:
 define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s16:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -733,7 +640,6 @@ entry:
 define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s32:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -745,7 +651,6 @@ entry:
 define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s16:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -757,7 +662,6 @@ entry:
 define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s32:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -769,7 +673,6 @@ entry:
 define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s16:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -780,7 +683,6 @@ entry:
 define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s32:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -791,7 +693,6 @@ entry:
 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s16:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -802,7 +703,6 @@ entry:
 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s32:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -813,7 +713,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s16:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -825,7 +724,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s32:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -837,7 +735,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s16:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -849,7 +746,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s32:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -861,7 +757,6 @@ entry:
 define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u16:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -872,7 +767,6 @@ entry:
 define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u32:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -883,7 +777,6 @@ entry:
 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u16:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -894,7 +787,6 @@ entry:
 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u32:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -905,7 +797,6 @@ entry:
 define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u16:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -917,7 +808,6 @@ entry:
 define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u32:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -929,7 +819,6 @@ entry:
 define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u16:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -941,7 +830,6 @@ entry:
 define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u32:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -953,7 +841,6 @@ entry:
 define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u16:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -964,7 +851,6 @@ entry:
 define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u32:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -975,7 +861,6 @@ entry:
 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u16:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -986,7 +871,6 @@ entry:
 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u32:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -997,7 +881,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u16:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1009,7 +892,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u32:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1021,7 +903,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u16:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1033,7 +914,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u32:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1045,7 +925,6 @@ entry:
 define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_s16:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1055,7 +934,6 @@ entry:
 define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_s32:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1065,7 +943,6 @@ entry:
 define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_u16:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1075,7 +952,6 @@ entry:
 define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_u32:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1085,7 +961,6 @@ entry:
 define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s16:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1096,7 +971,6 @@ entry:
 define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s32:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1107,7 +981,6 @@ entry:
 define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u16:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1118,7 +991,6 @@ entry:
 define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u32:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1129,7 +1001,6 @@ entry:
 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s16:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1139,7 +1010,6 @@ entry:
 define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s32:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1149,7 +1019,6 @@ entry:
 define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u16:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1159,7 +1028,6 @@ entry:
 define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u32:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1169,7 +1037,6 @@ entry:
 define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s16:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1180,7 +1047,6 @@ entry:
 define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s32:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1191,7 +1057,6 @@ entry:
 define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u16:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1202,7 +1067,6 @@ entry:
 define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u32:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1213,7 +1077,6 @@ entry:
 define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s16:
 ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1224,7 +1087,6 @@ entry:
 define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s32:
 ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1235,7 +1097,6 @@ entry:
 define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s16:
 ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1247,7 +1108,6 @@ entry:
 define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s32:
 ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1259,7 +1119,6 @@ entry:
 define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s16:
 ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1270,7 +1129,6 @@ entry:
 define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s32:
 ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1281,7 +1139,6 @@ entry:
 define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16:
 ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1293,7 +1150,6 @@ entry:
 define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s32:
 ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1305,7 +1161,6 @@ entry:
 define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s16:
 ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1315,7 +1170,6 @@ entry:
 define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_lane_s32:
 ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1325,7 +1179,6 @@ entry:
 define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s16:
 ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1335,7 +1188,6 @@ entry:
 define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_laneq_s32:
 ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1345,7 +1197,6 @@ entry:
 define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s16:
 ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
@@ -1356,7 +1207,6 @@ entry:
 define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_lane_s32:
 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
@@ -1367,7 +1217,6 @@ entry:
 define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s16:
 ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
@@ -1378,7 +1227,6 @@ entry:
 define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vqdmull_high_laneq_s32:
 ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
@@ -1389,7 +1237,6 @@ entry:
 define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s16:
 ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1399,7 +1246,6 @@ entry:
 define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s16:
 ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1409,7 +1255,6 @@ entry:
 define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulh_lane_s32:
 ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1419,7 +1264,6 @@ entry:
 define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmulhq_lane_s32:
 ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1429,7 +1273,6 @@ entry:
 define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s16:
 ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
@@ -1439,7 +1282,6 @@ entry:
 define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s16:
 ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
@@ -1449,7 +1291,6 @@ entry:
 define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulh_lane_s32:
 ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
   %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
@@ -1459,7 +1300,6 @@ entry:
 define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqrdmulhq_lane_s32:
 ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
@@ -1468,12 +1308,9 @@ entry:
 define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmul_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1483,10 +1320,6 @@ entry:
 define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) {
 ; CHECK-LABEL: test_vmul_lane_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_lane_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1498,12 +1331,9 @@ entry:
 define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmulq_lane_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1512,12 +1342,9 @@ entry:
 define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) {
 ; CHECK-LABEL: test_vmulq_lane_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_lane_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %mul = fmul <2 x double> %shuffle, %a
@@ -1526,12 +1353,9 @@ entry:
 define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %mul = fmul <2 x float> %shuffle, %a
@@ -1541,10 +1365,6 @@ entry:
 define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmul_laneq_f64:
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmul_laneq_f64:
-; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
-; EXYNOS-NEXT: ret
 entry:
   %0 = bitcast <1 x double> %a to <8 x i8>
   %1 = bitcast <8 x i8> %0 to double
@@ -1556,12 +1376,9 @@ entry:
 define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_f32:
-; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %mul = fmul <4 x float> %shuffle, %a
@@ -1570,12 +1387,9 @@ entry:
 define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_f64:
-; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulq_laneq_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %mul = fmul <2 x double> %shuffle, %a
@@ -1584,12 +1398,9 @@ entry:
 define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmulx_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1598,12 +1409,9 @@ entry:
 define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) {
 ; CHECK-LABEL: test_vmulxq_lane_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; Exynos-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1612,12 +1420,9 @@ entry:
 define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) {
 ; CHECK-LABEL: test_vmulxq_lane_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_lane_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1626,12 +1431,9 @@ entry:
 define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
 ; CHECK-LABEL: test_vmulx_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulx_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
@@ -1640,12 +1442,9 @@ entry:
 define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) {
 ; CHECK-LABEL: test_vmulxq_laneq_f32:
-; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f32:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
-; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3]
+; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
   %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
@@ -1654,12 +1453,9 @@ entry:
 define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) {
 ; CHECK-LABEL: test_vmulxq_laneq_f64:
-; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vmulxq_laneq_f64:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
-; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[1]
+; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
   %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
@@ -1669,7 +1465,6 @@ entry:
 define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmla_lane_s16_0:
 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1680,7 +1475,6 @@ entry:
 define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s16_0:
 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1691,7 +1485,6 @@ entry:
 define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmla_lane_s32_0:
 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1702,7 +1495,6 @@ entry:
 define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_lane_s32_0:
 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1713,7 +1505,6 @@ entry:
 define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s16_0:
 ; CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1724,7 +1515,6 @@ entry:
 define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s16_0:
 ; CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1735,7 +1525,6 @@ entry:
 define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmla_laneq_s32_0:
 ; CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1746,7 +1535,6 @@ entry:
 define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlaq_laneq_s32_0:
 ; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1757,7 +1545,6 @@ entry:
 define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmls_lane_s16_0:
 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1768,7 +1555,6 @@ entry:
 define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s16_0:
 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1779,7 +1565,6 @@ entry:
 define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmls_lane_s32_0:
 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1790,7 +1575,6 @@ entry:
 define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_lane_s32_0:
 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1801,7 +1585,6 @@ entry:
 define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s16_0:
 ; CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %b
@@ -1812,7 +1595,6 @@ entry:
 define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s16_0:
 ; CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %b
@@ -1823,7 +1605,6 @@ entry:
 define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmls_laneq_s32_0:
 ; CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %b
@@ -1834,7 +1615,6 @@ entry:
 define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsq_laneq_s32_0:
 ; CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %b
@@ -1845,7 +1625,6 @@ entry:
 define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_s16_0:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1855,7 +1634,6 @@ entry:
 define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s16_0:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1865,7 +1643,6 @@ entry:
 define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_s32_0:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1875,7 +1652,6 @@ entry:
 define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_s32_0:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1885,7 +1661,6 @@ entry:
 define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmul_lane_u16_0:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1895,7 +1670,6 @@ entry:
 define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u16_0:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1905,7 +1679,6 @@ entry:
 define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmul_lane_u32_0:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1915,7 +1688,6 @@ entry:
 define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_lane_u32_0:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1925,7 +1697,6 @@ entry:
 define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s16_0:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1935,7 +1706,6 @@ entry:
 define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s16_0:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1945,7 +1715,6 @@ entry:
 define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_s32_0:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1955,7 +1724,6 @@ entry:
 define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_s32_0:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -1965,7 +1733,6 @@ entry:
 define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u16_0:
 ; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i16> %shuffle, %a
@@ -1975,7 +1742,6 @@ entry:
 define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u16_0:
 ; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
   %mul = mul <8 x i16> %shuffle, %a
@@ -1985,7 +1751,6 @@ entry:
 define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmul_laneq_u32_0:
 ; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %mul = mul <2 x i32> %shuffle, %a
@@ -1995,7 +1760,6 @@ entry:
 define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmulq_laneq_u32_0:
 ; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
   %mul = mul <4 x i32> %shuffle, %a
@@ -2004,12 +1768,9 @@ entry:
 define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfma_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_lane_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2018,12 +1779,9 @@ entry:
 define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmaq_lane_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_lane_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %lane = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2032,12 +1790,9 @@ entry:
 define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfma_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfma_laneq_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -2046,12 +1801,9 @@ entry:
 define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f32_0:
-; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
   %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane, <4 x float> %b, <4 x float> %a)
@@ -2060,12 +1812,9 @@ entry:
 define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfms_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_lane_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <2 x i32> zeroinitializer
@@ -2075,12 +1824,9 @@ entry:
 define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) {
 ; CHECK-LABEL: test_vfmsq_lane_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_lane_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %sub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <2 x float> %sub, <2 x float> undef, <4 x i32> zeroinitializer
@@ -2090,12 +1836,9 @@ entry:
 define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfms_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfms_laneq_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <2 x i32> zeroinitializer
@@ -2105,12 +1848,9 @@ entry:
 define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f32_0:
-; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f32_0:
-; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
-; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s
 entry:
   %sub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
   %lane = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> zeroinitializer
@@ -2120,12 +1860,9 @@ entry:
 define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmaq_laneq_f64_0:
-; CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmaq_laneq_f64_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %lane = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
   %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %lane, <2 x double> %b, <2 x double> %a)
@@ -2134,12 +1871,9 @@ entry:
 define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) {
 ; CHECK-LABEL: test_vfmsq_laneq_f64_0:
-; CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-; CHECK-NEXT: ret
-; EXYNOS-LABEL: test_vfmsq_laneq_f64_0:
-; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
-; EXYNOS: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d
-; EXYNOS-NEXT: ret
+; GENERIC: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0]
+; EXYNOSM1: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d
 entry:
   %sub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v
   %lane = shufflevector <2 x double> %sub, <2 x double> undef, <2 x i32> zeroinitializer
@@ -2150,7 +1884,6 @@ entry:
 define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16_0:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2161,7 +1894,6 @@ entry:
 define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s32_0:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2172,7 +1904,6 @@ entry:
 define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s16_0:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2183,7 +1914,6 @@ entry:
 define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_s32_0:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2194,7 +1924,6 @@ entry:
 define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s16_0:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2206,7 +1935,6 @@ entry:
 define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_s32_0:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2218,7 +1946,6 @@ entry:
 define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s16_0:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2230,7 +1957,6 @@ entry:
 define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_s32_0:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2242,7 +1968,6 @@ entry:
 define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s16_0:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2253,7 +1978,6 @@ entry:
 define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_s32_0:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2264,7 +1988,6 @@ entry:
 define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s16_0:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2275,7 +1998,6 @@ entry:
 define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_s32_0:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2286,7 +2008,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s16_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2298,7 +2019,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_s32_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2310,7 +2030,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s16_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2322,7 +2041,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_s32_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2334,7 +2052,6 @@ entry:
 define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u16_0:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2345,7 +2062,6 @@ entry:
 define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_lane_u32_0:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2356,7 +2072,6 @@ entry:
 define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u16_0:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2367,7 +2082,6 @@ entry:
 define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_laneq_u32_0:
 ; CHECK: mlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2378,7 +2092,6 @@ entry:
 define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u16_0:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2390,7 +2103,6 @@ entry:
 define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_lane_u32_0:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2402,7 +2114,6 @@ entry:
 define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u16_0:
 ; CHECK: mlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2414,7 +2125,6 @@ entry:
 define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlal_high_laneq_u32_0:
 ; CHECK: mlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2426,7 +2136,6 @@ entry:
 define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u16_0:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2437,7 +2146,6 @@ entry:
 define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_lane_u32_0:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2448,7 +2156,6 @@ entry:
 define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u16_0:
 ; CHECK: mlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2459,7 +2166,6 @@ entry:
 define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_laneq_u32_0:
 ; CHECK: mlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2470,7 +2176,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u16_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2482,7 +2187,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_lane_u32_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2494,7 +2198,6 @@ entry:
 define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u16_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2506,7 +2209,6 @@ entry:
 define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmlsl_high_laneq_u32_0:
 ; CHECK: mlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2518,7 +2220,6 @@ entry:
 define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_s16_0:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2528,7 +2229,6 @@ entry:
 define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_s32_0:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2538,7 +2238,6 @@ entry:
 define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_lane_u16_0:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2548,7 +2247,6 @@ entry:
 define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_lane_u32_0:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2558,7 +2256,6 @@ entry:
 define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s16_0:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2569,7 +2266,6 @@ entry:
 define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_s32_0:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2580,7 +2276,6 @@ entry:
 define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u16_0:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2591,7 +2286,6 @@ entry:
 define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_lane_u32_0:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2602,7 +2296,6 @@ entry:
 define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s16_0:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2612,7 +2305,6 @@ entry:
 define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_s32_0:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2622,7 +2314,6 @@ entry:
 define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u16_0:
 ; CHECK: mull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
@@ -2632,7 +2323,6 @@ entry:
 define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_laneq_u32_0:
 ; CHECK: mull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
@@ -2642,7 +2332,6 @@ entry:
 define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s16_0:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2653,7 +2342,6 @@ entry:
 define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_s32_0:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2664,7 +2352,6 @@ entry:
 define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u16_0:
 ; CHECK: mull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
@@ -2675,7 +2362,6 @@ entry:
 define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) {
 ; CHECK-LABEL: test_vmull_high_laneq_u32_0:
 ; CHECK: mull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
@@ -2686,7 +2372,6 @@ entry:
 define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s16_0:
 ; CHECK: qdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2697,7 +2382,6 @@ entry:
 define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_lane_s32_0:
 ; CHECK: qdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2708,7 +2392,6 @@ entry:
 define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s16_0:
 ; CHECK: qdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2720,7 +2403,6 @@ entry:
 define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlal_high_lane_s32_0:
 ; CHECK: qdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
@@ -2732,7 +2414,6 @@ entry:
 define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s16_0:
 ; CHECK: qdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
   %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -2743,7 +2424,6 @@ entry:
 define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) {
 ; CHECK-LABEL: test_vqdmlsl_lane_s32_0:
 ; CHECK: qdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
   %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -2754,7 +2434,6 @@ entry:
 define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vqdmlsl_high_lane_s16_0:
 ; CHECK: qdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
-; CHECK-NEXT: ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
@@ -2766,7 +2445,6 @@ entry:
 define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2
x i32> %v) {  ; CHECK-LABEL: test_vqdmlsl_high_lane_s32_0:  ; CHECK: qdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2778,7 +2456,6 @@ entry:  define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqdmull_lane_s16_0:  ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer    %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2788,7 +2465,6 @@ entry:  define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqdmull_lane_s32_0:  ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer    %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2798,7 +2474,6 @@ entry:  define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) {  ; CHECK-LABEL: test_vqdmull_laneq_s16_0:  ; CHECK: qdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer    %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -2808,7 +2483,6 @@ entry:  define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) {  ; CHECK-LABEL: test_vqdmull_laneq_s32_0:  ; CHECK: qdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer    %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -2818,7 +2492,6 @@ entry:  define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqdmull_high_lane_s16_0:  ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer @@ -2829,7 +2502,6 @@ entry:  define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqdmull_high_lane_s32_0:  ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer @@ -2840,7 +2512,6 @@ entry:  define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) {  ; CHECK-LABEL: test_vqdmull_high_laneq_s16_0:  ; CHECK: qdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>    %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer @@ -2851,7 +2522,6 @@ entry:  define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) {  ; CHECK-LABEL: test_vqdmull_high_laneq_s32_0:  ; CHECK: qdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  
entry:    %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>    %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer @@ -2862,7 +2532,6 @@ entry:  define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqdmulh_lane_s16_0:  ; CHECK: qdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer    %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2872,7 +2541,6 @@ entry:  define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqdmulhq_lane_s16_0:  ; CHECK: qdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer    %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2882,7 +2550,6 @@ entry:  define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqdmulh_lane_s32_0:  ; CHECK: qdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer    %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2892,7 +2559,6 @@ entry:  define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqdmulhq_lane_s32_0:  ; CHECK: qdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer    %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2902,7 +2568,6 @@ entry:  define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqrdmulh_lane_s16_0:  ; CHECK: qrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer    %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle) @@ -2912,7 +2577,6 @@ entry:  define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) {  ; CHECK-LABEL: test_vqrdmulhq_lane_s16_0:  ; CHECK: qrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer    %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle) @@ -2922,7 +2586,6 @@ entry:  define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqrdmulh_lane_s32_0:  ; CHECK: qrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer    %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle) @@ -2932,7 +2595,6 @@ entry:  define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) {  ; CHECK-LABEL: test_vqrdmulhq_lane_s32_0:  ; CHECK: qrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret  entry:    %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer    %vqrdmulh2.i = tail call <4 x i32> 
@llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle) @@ -2941,12 +2603,9 @@ entry:  define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) {  ; CHECK-LABEL: test_vmul_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_lane_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s  entry:    %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer    %mul = fmul <2 x float> %shuffle, %a @@ -2955,12 +2614,9 @@ entry:  define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) {  ; CHECK-LABEL: test_vmulq_lane_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_lane_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s  entry:    %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer    %mul = fmul <4 x float> %shuffle, %a @@ -2969,12 +2625,9 @@ entry:  define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) {  ; CHECK-LABEL: test_vmul_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s  entry:    %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer    %mul = fmul <2 x float> %shuffle, %a @@ -2984,10 +2637,6 @@ entry:  define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) {  ; CHECK-LABEL: test_vmul_laneq_f64_0:  ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmul_laneq_f64_0: -; EXYNOS: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0] -; EXYNOS-NEXT: ret  entry:    %0 = bitcast <1 x double> %a to <8 x i8>    %1 = bitcast <8 x i8> %0 to double @@ -2999,12 +2648,9 @@ entry:  define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {  ; CHECK-LABEL: test_vmulq_laneq_f32_0: -; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s  entry:    %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer    %mul = fmul <4 x float> %shuffle, %a @@ -3013,12 +2659,9 @@ entry:  define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {  ; CHECK-LABEL: test_vmulq_laneq_f64_0: -; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, 
{{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulq_laneq_f64_0: -; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d  entry:    %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer    %mul = fmul <2 x double> %shuffle, %a @@ -3027,12 +2670,9 @@ entry:  define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) {  ; CHECK-LABEL: test_vmulx_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_lane_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s  entry:    %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer    %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3041,12 +2681,9 @@ entry:  define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) {  ; CHECK-LABEL: test_vmulxq_lane_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s  entry:    %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer    %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3055,12 +2692,9 @@ entry:  define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) {  ; CHECK-LABEL: test_vmulxq_lane_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_lane_f64_0: -; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d  entry:    %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer    %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3069,12 +2703,9 @@ entry:  define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) {  ; CHECK-LABEL: test_vmulx_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulx_laneq_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s  entry:    %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer    
%vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -3083,12 +2714,9 @@ entry:  define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) {  ; CHECK-LABEL: test_vmulxq_laneq_f32_0: -; CHECK: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f32_0: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[0] -; EXYNOS: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[0] +; EXYNOSM1: mulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s  entry:    %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer    %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle) @@ -3097,12 +2725,9 @@ entry:  define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) {  ; CHECK-LABEL: test_vmulxq_laneq_f64_0: -; CHECK: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] -; CHECK-NEXT: ret -; EXYNOS-LABEL: test_vmulxq_laneq_f64_0: -; EXYNOS: dup  [[x:v[0-9]+]].2d, {{v[0-9]+}}.d[0] -; EXYNOS: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[x]].2d -; EXYNOS-NEXT: ret +; GENERIC: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: dup  [[V:v[0-9]+]].2d, {{v[0-9]+}}.d[0] +; EXYNOSM1: mulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, [[V]].2d  entry:    %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer    %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle) @@ -3111,14 +2736,11 @@ entry:  define <4 x float> @optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {  ; CHECK-LABEL: optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK-NEXT: ret -; EXYNOS-LABEL: optimize_dup: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s  entry:    %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>    %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3130,15 +2752,12 @@ entry:  define <4 x float> @no_optimize_dup(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %v) {  ; CHECK-LABEL: no_optimize_dup: -; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] -; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret -; EXYNOS-LABEL: no_optimize_dup: -; EXYNOS: dup  [[x:v[0-9]+]].4s, {{v[0-9]+}}.s[3] -; EXYNOS: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[x]].4s -; EXYNOS: dup  [[y:v[0-9]+]].4s, {{v[0-9]+}}.s[1] -; EXYNOS: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[y]].4s -; EXYNOS-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3] +; GENERIC: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: dup  [[V:v[0-9]+]].4s, {{v[0-9]+}}.s[3] +; EXYNOSM1: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[V]].4s +; EXYNOSM1: 
dup  [[W:v[0-9]+]].4s, {{v[0-9]+}}.s[1] +; EXYNOSM1: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, [[W]].4s  entry:    %lane1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>    %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %lane1, <4 x float> %b, <4 x float> %a) @@ -3150,8 +2769,7 @@ entry:  define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {  ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57: -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1] -; CHECK-NEXT: ret +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]  entry:    %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>    %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -3160,9 +2778,8 @@ entry:  define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {  ; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1: -; CHECK: dup  [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1] -; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s -; CHECK-NEXT: ret +; GENERIC: dup  [[V:v[0-9]+]].2s, {{v[0-9]+}}.s[1] +; GENERIC: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[V]].2s  entry:    %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>    %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) diff --git a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll index 453334dce601..2fb9d3b2d030 100644 --- a/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ b/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -87,4 +87,13 @@ for.end:    ret double %v0  } +define <2 x i64> @t6() { +; ALL-LABEL: t6: +; CYCLONE: movi.16b v0, #0 +; KRYO: movi v0.2d, #0000000000000000 +; FALKOR: movi v0.2d, #0000000000000000 + ret <2 x i64> zeroinitializer +} + +  declare double @sin(double) diff --git a/test/CodeGen/AArch64/chkstk.ll b/test/CodeGen/AArch64/chkstk.ll new file mode 100644 index 000000000000..1c2e5528f10c --- /dev/null +++ b/test/CodeGen/AArch64/chkstk.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs %s -o - \ +; RUN:  | FileCheck -check-prefix CHECK-DEFAULT-CODE-MODEL %s + +; RUN: llc -mtriple=aarch64-windows -verify-machineinstrs -code-model=large %s -o - \ +; RUN:  | FileCheck -check-prefix CHECK-LARGE-CODE-MODEL %s + +define void @check_watermark() { +entry: +  %buffer = alloca [4096 x i8], align 1 +  ret void +} + +; CHECK-DEFAULT-CODE-MODEL: check_watermark: +; CHECK-DEFAULT-CODE-MODEL-DAG: stp x29, x30, [sp +; CHECK-DEFAULT-CODE-MODEL-DAG: orr x15, xzr, #0x100 +; CHECK-DEFAULT-CODE-MODEL:     bl __chkstk +; CHECK-DEFAULT-CODE-MODEL:     sub sp, sp, x15, lsl #4 + +; CHECK-LARGE-CODE-MODEL: check_watermark: +; CHECK-LARGE-CODE-MODEL-DAG: stp x29, x30, [sp +; CHECK-LARGE-CODE-MODEL-DAG: orr x15, xzr, #0x100 +; CHECK-LARGE-CODE-MODEL-DAG: adrp x16, __chkstk +; CHECK-LARGE-CODE-MODEL-DAG: add x16, x16, __chkstk +; CHECK-LARGE-CODE-MODEL:     blr x16 +; CHECK-LARGE-CODE-MODEL:     sub sp, sp, x15, lsl #4 diff --git a/test/CodeGen/AArch64/ldst-paired-aliasing.ll b/test/CodeGen/AArch64/ldst-paired-aliasing.ll index 9c698b5fdcc6..9b0b51d369a3 100644 --- a/test/CodeGen/AArch64/ldst-paired-aliasing.ll +++ b/test/CodeGen/AArch64/ldst-paired-aliasing.ll @@ -10,11 +10,10 @@ declare void @llvm.memset.p0i8.i64(i8* 
nocapture, i8, i64, i32, i1) #3  define i32 @main() local_unnamed_addr #1 {  ; Make sure the stores happen in the correct order (the exact instructions could change).  ; CHECK-LABEL: main: -; CHECK: stp xzr, xzr, [sp, #72] +; CHECK: str xzr, [sp, #80]  ; CHECK: str w9, [sp, #80] -; CHECK: str q0, [sp, #48] +; CHECK: stp q0, q0, [sp, #48]  ; CHECK: ldr w8, [sp, #48] -; CHECK: str q0, [sp, #64]  for.body.lr.ph.i.i.i.i.i.i63:    %b1 = alloca [10 x i32], align 16 diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 228d3c7d4306..71c4c83c28f9 100644 --- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -251,7 +251,8 @@ entry:  ; R600: MOVA_INT -; SI-PROMOTE-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: +; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:  ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0  ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll new file mode 100644 index 000000000000..f97785beab6f --- /dev/null +++ b/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -0,0 +1,32 @@ +; RUN: llc -mtriple=amdgcn--amdhsa-amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Effectively, check that the compile finishes; in the case +; of an infinite loop, llc toggles between merging 2 ST4s +; ( MergeConsecutiveStores() ) and breaking the resulting ST8 +; apart ( LegalizeStoreOps() ). + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" + +; GCN-LABEL: {{^}}_Z6brokenPd: +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} +; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} +define amdgpu_kernel void @_Z6brokenPd(double* %arg) { +bb: +  %tmp = alloca double, align 8, addrspace(5) +  %tmp1 = alloca double, align 8, addrspace(5) +  %tmp2 = load double, double* %arg, align 8 +  br i1 1, label %bb6, label %bb4 + +bb3:                                             ; No predecessors! 
+  br label %bb4 + +bb4:                                             ; preds = %bb3, %bb +  %tmp5 = phi double addrspace(5)* [ %tmp1, %bb3 ], [ %tmp, %bb ] +  store double %tmp2, double addrspace(5)* %tmp5, align 8 +  br label %bb6 + +bb6:                                             ; preds = %bb4, %bb +  %tmp7 = phi double [ 0x7FF8123000000000, %bb4 ], [ 0x7FF8000000000000, %bb ] +  store double %tmp7, double* %arg, align 8 +  ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 7c2666e3680f..8b9b83f6d0e7 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -6,6 +6,7 @@    define void @test_trunc_and_zext_s16() { ret void }    define void @test_trunc_and_anyext_s8() { ret void }    define void @test_trunc_and_anyext_s16() { ret void } +  define void @test_trunc_s64() #0 { ret void }    define void @test_add_s32() { ret void }    define void @test_add_fold_imm_s32() { ret void } @@ -46,6 +47,10 @@    define void @test_gep() { ret void }    define void @test_constant_imm() { ret void }    define void @test_constant_cimm() { ret void } +  define void @test_pointer_constant() { ret void } + +  define void @test_inttoptr_s32() { ret void } +  define void @test_ptrtoint_s32() { ret void }    define void @test_select_s32() { ret void }    define void @test_select_ptr() { ret void } @@ -241,6 +246,36 @@ body:             |      ; CHECK: BX_RET 14, %noreg, implicit %r0  ...  --- +name:            test_trunc_s64 +# CHECK-LABEL: name: test_trunc_s64 +legalized:       true +regBankSelected: true +selected:        false +# CHECK: selected: true +registers: +  - { id: 0, class: fprb } +  - { id: 1, class: gprb } +  - { id: 2, class: gprb } +body:             | +  bb.0: +    liveins: %r0, %d0 + +    %0(s64) = COPY %d0 +    ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY %d0 + +    %2(p0) = COPY %r0 +    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0 + +    %1(s32) = G_TRUNC %0(s64) +    ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]] + +    G_STORE %1(s32), %2 :: (store 4) +    ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14, %noreg + +    BX_RET 14, %noreg +    ; CHECK: BX_RET 14, %noreg +... +---  name:            test_add_s32  # CHECK-LABEL: name: test_add_s32  legalized:       true @@ -1075,6 +1110,71 @@ body:             |      BX_RET 14, %noreg, implicit %r0  ...  --- +name:            test_pointer_constant +# CHECK-LABEL: name: test_pointer_constant +legalized:       true +regBankSelected: true +selected:        false +# CHECK: selected: true +registers: +  - { id: 0, class: gprb } +body:             | +  bb.0: +    %0(p0) = G_CONSTANT i32 0 +    ; CHECK: %[[C:[0-9]+]]:gpr = MOVi 0, 14, %noreg, %noreg + +    %r0 = COPY %0(p0) +    BX_RET 14, %noreg, implicit %r0 +... +--- +name:            test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized:       true +regBankSelected: true +selected:        false +# CHECK: selected: true +registers: +  - { id: 0, class: gprb } +  - { id: 1, class: gprb } +body:             | +  bb.0: +    liveins: %r0 + +    %0(s32) = COPY %r0 +    %1(p0) = G_INTTOPTR %0(s32) +    ; CHECK: [[INT:%[0-9]+]]:gpr = COPY %r0 +    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY [[INT]] + +    %r0 = COPY %1(p0) +    ; CHECK: %r0 = COPY [[PTR]] + +    BX_RET 14, %noreg, implicit %r0 +... 
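For reference, a minimal IR sketch (hypothetical function names, not part of the patch) of the kind of input the IRTranslator lowers into the G_INTTOPTR tested above and the G_PTRTOINT tested next; since ARM pointers are 32-bit, both conversions are expected to select to plain register COPYs, exactly as the CHECK lines assert:

; Hypothetical reduction, assuming 32-bit pointers (typed-pointer IR of this era).
define i32* @example_inttoptr(i32 %x) {
  %p = inttoptr i32 %x to i32*    ; becomes %1(p0) = G_INTTOPTR %0(s32)
  ret i32* %p
}

define i32 @example_ptrtoint(i32* %p) {
  %x = ptrtoint i32* %p to i32    ; becomes %1(s32) = G_PTRTOINT %0(p0)
  ret i32 %x
}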
+--- +name:            test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized:       true +regBankSelected: true +selected:        false +# CHECK: selected: true +registers: +  - { id: 0, class: gprb } +  - { id: 1, class: gprb } +body:             | +  bb.0: +    liveins: %r0 + +    %0(p0) = COPY %r0 +    %1(s32) = G_PTRTOINT %0(p0) +    ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY %r0 +    ; CHECK: [[INT:%[0-9]+]]:gpr = COPY [[PTR]] + +    %r0 = COPY %1(s32) +    ; CHECK: %r0 = COPY [[INT]] + +    BX_RET 14, %noreg, implicit %r0 +... +---  name:            test_select_s32  # CHECK-LABEL: name: test_select_s32  legalized:       true diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index e3e206cf76e9..204434e981b4 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -3,6 +3,9 @@    define void @test_sext_s8() { ret void }    define void @test_zext_s16() { ret void } +  define void @test_inttoptr_s32() { ret void } +  define void @test_ptrtoint_s32() { ret void } +    define void @test_add_s8() { ret void }    define void @test_add_s16() { ret void }    define void @test_add_s32() { ret void } @@ -101,6 +104,50 @@ body:             |      BX_RET 14, %noreg, implicit %r0  ...  --- +name:            test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized:       false +# CHECK: legalized: true +regBankSelected: false +selected:        false +tracksRegLiveness: true +registers: +  - { id: 0, class: _ } +  - { id: 1, class: _ } +body:             | +  bb.0: +    liveins: %r0 + +    %0(s32) = COPY %r0 +    %1(p0) = G_INTTOPTR %0(s32) +    ; G_INTTOPTR with s32 is legal, so we should find it unchanged in the output +    ; CHECK: {{%[0-9]+}}:_(p0) = G_INTTOPTR {{%[0-9]+}} +    %r0 = COPY %1(p0) +    BX_RET 14, %noreg, implicit %r0 +... +--- +name:            test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized:       false +# CHECK: legalized: true +regBankSelected: false +selected:        false +tracksRegLiveness: true +registers: +  - { id: 0, class: _ } +  - { id: 1, class: _ } +body:             | +  bb.0: +    liveins: %r0 + +    %0(p0) = COPY %r0 +    %1(s32) = G_PTRTOINT %0(p0) +    ; G_PTRTOINT with s32 is legal, so we should find it unchanged in the output +    ; CHECK: {{%[0-9]+}}:_(s32) = G_PTRTOINT {{%[0-9]+}} +    %r0 = COPY %1(s32) +    BX_RET 14, %noreg, implicit %r0 +... +---  name:            test_add_s8  # CHECK-LABEL: name: test_add_s8  legalized:       false @@ -826,6 +873,7 @@ registers:    - { id: 2, class: _ }    - { id: 3, class: _ }    - { id: 4, class: _ } +  - { id: 5, class: _ }  body:             |    bb.0:      liveins: %r0 @@ -856,6 +904,10 @@ body:             |      ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32)      ; CHECK-NOT: G_CONSTANT i1 +    %5(p0) = G_CONSTANT 0 +    G_STORE %5(p0), %4(p0) :: (store 4) +    ; CHECK: {{%[0-9]+}}:_(p0) = G_CONSTANT 0 +      %r0 = COPY %0(s32)      BX_RET 14, %noreg, implicit %r0  ... 
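The legalizer additions above assert that s32 G_INTTOPTR/G_PTRTOINT and a pointer-typed G_CONSTANT pass through unchanged; a rough IR reduction (hypothetical, not taken from the patch) that would produce a p0 G_CONSTANT like the `%5(p0) = G_CONSTANT 0` case is a store of a null pointer:

; Hypothetical sketch: storing null materializes a pointer-typed constant zero,
; which the legalizer must accept rather than assert on.
define void @store_null(i32** %slot) {
  store i32* null, i32** %slot
  ret void
}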
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 044740e33a2d..175333626f97 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -24,6 +24,9 @@    define void @test_constants() { ret void } +  define void @test_inttoptr_s32() { ret void } +  define void @test_ptrtoint_s32() { ret void } +    @a_global = global float 1.0    define void @test_globals() { ret void } @@ -31,6 +34,7 @@    define void @test_anyext_s16_32() { ret void }    define void @test_trunc_s32_16() { ret void } +  define void @test_trunc_s64_32() #0 { ret void }    define void @test_icmp_eq_s32() { ret void }    define void @test_fcmp_one_s32() #0 { ret void } @@ -496,6 +500,44 @@ body:             |      BX_RET 14, %noreg, implicit %r0  ...  --- +name:            test_inttoptr_s32 +# CHECK-LABEL: name: test_inttoptr_s32 +legalized:       true +regBankSelected: false +selected:        false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +registers: +  - { id: 0, class: _ } +  - { id: 1, class: _ } +body:             | +  bb.0: +    %0(s32) = COPY %r0 +    %1(p0) = G_INTTOPTR %0(s32) +    %r0 = COPY %1(p0) +    BX_RET 14, %noreg, implicit %r0 +... +--- +name:            test_ptrtoint_s32 +# CHECK-LABEL: name: test_ptrtoint_s32 +legalized:       true +regBankSelected: false +selected:        false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +registers: +  - { id: 0, class: _ } +  - { id: 1, class: _ } +body:             | +  bb.0: +    %0(p0) = COPY %r0 +    %1(s32) = G_PTRTOINT %0(p0) +    %r0 = COPY %1(s32) +    BX_RET 14, %noreg, implicit %r0 +... +---  name:            test_globals  # CHECK-LABEL: name: test_globals  legalized:       true @@ -584,6 +626,30 @@ body:             |      BX_RET 14, %noreg  ...  --- +name:            test_trunc_s64_32 +# CHECK-LABEL: name: test_trunc_s64_32 +legalized:       true +regBankSelected: false +selected:        false +# CHECK: registers: +# CHECK: - { id: 0, class: fprb, preferred-register: '' } +# CHECK: - { id: 1, class: gprb, preferred-register: '' } +# CHECK: - { id: 2, class: gprb, preferred-register: '' } +registers: +  - { id: 0, class: _ } +  - { id: 1, class: _ } +  - { id: 2, class: _ } +body:             | +  bb.0: +    liveins: %r0, %d0 + +    %0(s64) = COPY %d0 +    %2(p0) = COPY %r0 +    %1(s32) = G_TRUNC %0(s64) +    G_STORE %1(s32), %2 :: (store 4) +    BX_RET 14, %noreg +... 
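In the new test_trunc_s64_32 bank assignment above, the s64 source is expected on the FPR bank (it arrives in %d0) while the truncated result and the pointer land on the GPR bank; a rough IR equivalent (hypothetical, and assuming an i64 value that reaches the function in a d-register) would be:

; Hypothetical sketch of the trunc-and-store pattern behind the MIR test.
define void @trunc_store(i64 %val, i32* %p) {
  %t = trunc i64 %val to i32
  store i32 %t, i32* %p
  ret void
}

At selection time (see the arm-instruction-select.mir change earlier in this commit) the cross-bank truncation is implemented with VMOVRRD, which moves the d-register into two core registers and keeps only the low half.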
+---  name:            test_icmp_eq_s32  # CHECK-LABEL: name: test_icmp_eq_s32  legalized:       true diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll index 78d3ebf371a4..9373c5d44210 100644 --- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll +++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift     | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift     -simplifycfg-sink-common=false | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT  ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false  ; dependency) when it isn't dependent on last CPSR defining instruction.  ; rdar://8928208 diff --git a/test/CodeGen/ARM/su-addsub-overflow.ll b/test/CodeGen/ARM/su-addsub-overflow.ll new file mode 100644 index 000000000000..eef531282033 --- /dev/null +++ b/test/CodeGen/ARM/su-addsub-overflow.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=generic | FileCheck %s + +define i32 @sadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: sadd: +; CHECK:    mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT:    add r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT:    cmp r[[R1]], r[[R0]] +; CHECK-NEXT:    movvc pc, lr +entry: +  %0 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) +  %1 = extractvalue { i32, i1 } %0, 1 +  br i1 %1, label %trap, label %cont + +trap: +  tail call void @llvm.trap() #2 +  unreachable + +cont: +  %2 = extractvalue { i32, i1 } %0, 0 +  ret i32 %2 + +} + +define i32 @uadd(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: uadd: +; CHECK:    mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT:    adds r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT:    cmp r[[R1]], r[[R0]] +; CHECK-NEXT:    movhs pc, lr +entry: +  %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) +  %1 = extractvalue { i32, i1 } %0, 1 +  br i1 %1, label %trap, label %cont + +trap: +  tail call void @llvm.trap() #2 +  unreachable + +cont: +  %2 = extractvalue { i32, i1 } %0, 0 +  ret i32 %2 + +} + +define i32 @ssub(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: ssub: +; CHECK:    cmp r0, r1 +; CHECK-NEXT:    subvc r0, r0, r1 +; CHECK-NEXT:    movvc pc, lr +entry: +  %0 = tail call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) +  %1 = extractvalue { i32, i1 } %0, 1 +  br i1 %1, label %trap, label %cont + +trap: +  tail call void @llvm.trap() #2 +  unreachable + +cont: +  %2 = extractvalue { i32, i1 } %0, 0 +  ret i32 %2 + +} + +define i32 @usub(i32 %a, i32 %b) local_unnamed_addr #0 { +; CHECK-LABEL: usub: +; CHECK:    mov r[[R0:[0-9]+]], r0 +; CHECK-NEXT:    subs r[[R1:[0-9]+]], r[[R0]], r1 +; CHECK-NEXT:    cmp r[[R0]], r1 +; CHECK-NEXT:    movhs pc, lr +entry: +  %0 = tail call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) +  %1 = extractvalue { i32, i1 } %0, 1 +  br i1 %1, label %trap, label %cont + +trap: +  tail call void @llvm.trap() #2 +  unreachable + +cont: +  %2 = extractvalue { i32, i1 } %0, 0 +  ret i32 %2 + +} + +define void @sum(i32* %a, i32* %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: sum: +; CHECK:    ldr [[R0:r[0-9]+]], +; CHECK-NEXT:    ldr [[R1:r[0-9]+|lr]], +; 
CHECK-NEXT:    add [[R2:r[0-9]+]], [[R1]], [[R0]] +; CHECK-NEXT:    cmp [[R2]], [[R1]] +; CHECK-NEXT:    strvc [[R2]], +; CHECK-NEXT:    addvc +; CHECK-NEXT:    cmpvc +; CHECK-NEXT:    bvs +entry: +  %cmp7 = icmp eq i32 %n, 0 +  br i1 %cmp7, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: +  ret void + +for.body: +  %i.08 = phi i32 [ %7, %cont2 ], [ 0, %entry ] +  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.08 +  %0 = load i32, i32* %arrayidx, align 4 +  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.08 +  %1 = load i32, i32* %arrayidx1, align 4 +  %2 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %1, i32 %0) +  %3 = extractvalue { i32, i1 } %2, 1 +  br i1 %3, label %trap, label %cont + +trap: +  tail call void @llvm.trap() #2 +  unreachable + +cont: +  %4 = extractvalue { i32, i1 } %2, 0 +  store i32 %4, i32* %arrayidx1, align 4 +  %5 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %i.08, i32 1) +  %6 = extractvalue { i32, i1 } %5, 1 +  br i1 %6, label %trap, label %cont2 + +cont2: +  %7 = extractvalue { i32, i1 } %5, 0 +  %cmp = icmp eq i32 %7, %n +  br i1 %cmp, label %for.cond.cleanup, label %for.body + +} + +declare void @llvm.trap() #2 +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) #1 +declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1 diff --git a/test/CodeGen/ARM/usat.ll b/test/CodeGen/ARM/usat.ll new file mode 100644 index 000000000000..8f19d11ef7bb --- /dev/null +++ b/test/CodeGen/ARM/usat.ll @@ -0,0 +1,214 @@ +; RUN: llc -mtriple=armv4t-eabi %s -o -  | FileCheck %s --check-prefix=CHECK --check-prefix=V4T +; RUN: llc -mtriple=armv6-eabi %s -o -   | FileCheck %s --check-prefix=CHECK --check-prefix=V6 +; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=V6T2 + +; Check for several conditions that should result in USAT. +; For example, the base test is equivalent to +; x < 0 ? 0 : (x > k ? k : x) in C. All patterns that bound x +; to the interval [0, k] where k + 1 is a power of 2 can be +; transformed into USAT. At the end there are some tests +; checking that conditionals are not transformed if they don't +; match the right pattern. + +; +; Base tests with different bit widths +; + +; x < 0 ? 0 : (x > k ? k : x) +; 32-bit base test +define i32 @unsigned_sat_base_32bit(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_32bit: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpLow = icmp slt i32 %x, 0 +  %cmpUp = icmp sgt i32 %x, 8388607 +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x +  %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp +  ret i32 %saturateLow +} + +; x < 0 ? 0 : (x > k ? k : x) +; 16-bit base test +define i16 @unsigned_sat_base_16bit(i16 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_16bit: +; V6: usat r0, #11, r0 +; V6T2: usat r0, #11, r0 +; V4T-NOT: usat +entry: +  %cmpLow = icmp slt i16 %x, 0 +  %cmpUp = icmp sgt i16 %x, 2047 +  %saturateUp = select i1 %cmpUp, i16 2047, i16 %x +  %saturateLow = select i1 %cmpLow, i16 0, i16 %saturateUp +  ret i16 %saturateLow +} + +; x < 0 ? 0 : (x > k ? 
k : x) +; 8-bit base test +define i8 @unsigned_sat_base_8bit(i8 %x) #0 { +; CHECK-LABEL: unsigned_sat_base_8bit: +; V6: usat r0, #5, r0 +; V6T2: usat r0, #5, r0 +; V4T-NOT: usat +entry: +  %cmpLow = icmp slt i8 %x, 0 +  %cmpUp = icmp sgt i8 %x, 31 +  %saturateUp = select i1 %cmpUp, i8 31, i8 %x +  %saturateLow = select i1 %cmpLow, i8 0, i8 %saturateUp +  ret i8 %saturateLow +} + +; +; Tests where the conditionals that check for upper and lower bounds, +; or the < and > operators, are arranged in different ways. Only some +; of the possible combinations that lead to USAT are tested. +; +; x < 0 ? 0 : (x < k ? x : k) +define i32 @unsigned_sat_lower_upper_1(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_lower_upper_1: +; V6: usat r0, #23, r0 +; V6T2: usat r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpLow = icmp slt i32 %x, 0 +  %cmpUp = icmp slt i32 %x, 8388607 +  %saturateUp = select i1 %cmpUp, i32 %x, i32 8388607 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %saturateUp +  ret i32 %saturateLow +} + +; x > 0 ? (x > k ? k : x) : 0 +define i32 @unsigned_sat_lower_upper_2(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_lower_upper_2: +; V6: usat    r0, #23, r0 +; V6T2: usat    r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpLow = icmp sgt i32 %x, 0 +  %cmpUp = icmp sgt i32 %x, 8388607 +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %x +  %saturateLow = select i1 %cmpLow, i32 %saturateUp, i32 0 +  ret i32 %saturateLow +} + +; x < k ? (x < 0 ? 0 : x) : k +define i32 @unsigned_sat_upper_lower_1(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_1: +; V6: usat    r0, #23, r0 +; V6T2: usat    r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpUp = icmp slt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %x +  %saturateUp = select i1 %cmpUp, i32 %saturateLow, i32 8388607 +  ret i32 %saturateUp +} + +; x > k ? k : (x < 0 ? 0 : x) +define i32 @unsigned_sat_upper_lower_2(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_2: +; V6: usat    r0, #23, r0 +; V6T2: usat    r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; k < x ? k : (x > 0 ? x : 0) +define i32 @unsigned_sat_upper_lower_3(i32 %x) #0 { +; CHECK-LABEL: unsigned_sat_upper_lower_3: +; V6: usat    r0, #23, r0 +; V6T2: usat    r0, #23, r0 +; V4T-NOT: usat +entry: +  %cmpUp = icmp slt i32 8388607, %x +  %cmpLow = icmp sgt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 %x, i32 0 +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; +; The following tests check for patterns that should not transform +; into USAT but are similar enough that could confuse the selector. +; +; x > k ? k : (x > 0 ? 0 : x) +; First condition upper-saturates, second doesn't lower-saturate. +define i32 @no_unsigned_sat_missing_lower(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_missing_lower +; CHECK-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp sgt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; x < k ? k : (x < 0 ? 0 : x) +; Second condition lower-saturates, first doesn't upper-saturate. 
+define i32 @no_unsigned_sat_missing_upper(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_missing_upper: +; CHECK-NOT: usat +entry: +  %cmpUp = icmp slt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; Lower constant is different in the select and in the compare +define i32 @no_unsigned_sat_incorrect_constant(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_constant: +; CHECK-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 -1, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; The interval is not [0, k] +define i32 @no_unsigned_sat_incorrect_interval(i32 %x) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_interval: +; CHECK-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, -4 +  %saturateLow = select i1 %cmpLow, i32 -4, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; The returned value (y) is not the same as the tested value (x). +define i32 @no_unsigned_sat_incorrect_return(i32 %x, i32 %y) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_return: +; CHECK-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %x, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %y +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} + +; One of the values in a compare (y) is not the same as the rest +; of the compare and select values (x). +define i32 @no_unsigned_sat_incorrect_compare(i32 %x, i32 %y) #0 { +; CHECK-LABEL: no_unsigned_sat_incorrect_compare: +; CHECK-NOT: usat +entry: +  %cmpUp = icmp sgt i32 %x, 8388607 +  %cmpLow = icmp slt i32 %y, 0 +  %saturateLow = select i1 %cmpLow, i32 0, i32 %x +  %saturateUp = select i1 %cmpUp, i32 8388607, i32 %saturateLow +  ret i32 %saturateUp +} diff --git a/test/CodeGen/BPF/objdump_imm_hex.ll b/test/CodeGen/BPF/objdump_imm_hex.ll new file mode 100644 index 000000000000..a245a6c791f2 --- /dev/null +++ b/test/CodeGen/BPF/objdump_imm_hex.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d - | FileCheck --check-prefix=CHECK-DEC %s +; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-objdump -d -print-imm-hex - | FileCheck --check-prefix=CHECK-HEX %s + +; Source Code: +; int gbl; +; int test(unsigned long long a, unsigned long long b) { +;   int ret = 0; +;   if (a == 0xABCDABCDabcdabcdULL) { +;     gbl = gbl * gbl * 2; +;     ret = 1; +;     goto out; +;   } +;   if (b == 0xABCDabcdabcdULL) { +;     gbl = gbl * 4; +;     ret = 2; +;   } +;  out: +;   return ret; +; } + +@gbl = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind +define i32 @test(i64, i64) local_unnamed_addr #0 { +; CHECK-LABEL: test +  %3 = icmp eq i64 %0, -6067004223159161907 +  br i1 %3, label %4, label %8 +; CHECK-DEC: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab         r3 = -6067004223159161907 ll +; CHECK-DEC: 5d 31 07 00 00 00 00 00         if r1 != r3 goto +7 +; CHECK-HEX: 18 03 00 00 cd ab cd ab 00 00 00 00 cd ab cd ab         r3 = -0x5432543254325433 ll +; CHECK-HEX: 5d 31 07 00 00 00 00 00         if r1 != r3 goto +0x7 + +; <label>:4:                                      ; preds = %2 +  %5 = load i32, i32* @gbl, align 4 +  %6 = shl i32 %5, 1 +; CHECK-DEC: 67 01 00 00 01 00 00 00         r1 
<<= 1 +; CHECK-HEX: 67 01 00 00 01 00 00 00         r1 <<= 0x1 +  %7 = mul i32 %6, %5 +  br label %13 + +; <label>:8:                                      ; preds = %2 +  %9 = icmp eq i64 %1, 188899839028173 +; CHECK-DEC: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00         r1 = 188899839028173 ll +; CHECK-HEX: 18 01 00 00 cd ab cd ab 00 00 00 00 cd ab 00 00         r1 = 0xabcdabcdabcd ll +  br i1 %9, label %10, label %16 + +; <label>:10:                                     ; preds = %8 +  %11 = load i32, i32* @gbl, align 4 +  %12 = shl nsw i32 %11, 2 +  br label %13 + +; <label>:13:                                     ; preds = %4, %10 +  %14 = phi i32 [ %12, %10 ], [ %7, %4 ] +  %15 = phi i32 [ 2, %10 ], [ 1, %4 ] +  store i32 %14, i32* @gbl, align 4 +; CHECK-DEC: 63 12 00 00 00 00 00 00         *(u32 *)(r2 + 0) = r1 +; CHECK-HEX: 63 12 00 00 00 00 00 00         *(u32 *)(r2 + 0x0) = r1 +  br label %16 + +; <label>:16:                                     ; preds = %13, %8 +  %17 = phi i32 [ 0, %8 ], [ %15, %13 ] +  ret i32 %17 +} + +attributes #0 = { norecurse nounwind } diff --git a/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll new file mode 100644 index 000000000000..f96dbf2af496 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/build-vector-i32-type.ll @@ -0,0 +1,19 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this doesn't crash. +; CHECK: sfcmp + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() #0 { +b0: +  %v1 = load <16 x float>, <16 x float>* null, align 8 +  %v2 = fcmp olt <16 x float> undef, %v1 +  %v3 = select <16 x i1> %v2, <16 x i16> undef, <16 x i16> zeroinitializer +  %v4 = sext <16 x i16> %v3 to <16 x i32> +  store <16 x i32> %v4, <16 x i32>* undef, align 64 +  unreachable +} + +attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b,+hvxv60" } diff --git a/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll new file mode 100644 index 000000000000..4cbd00837fc6 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/isel-bool-vector.ll @@ -0,0 +1,18 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this testcase doesn't crash. +; CHECK: sfcmp + +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define void @fred() #0 { +b0: +  %v1 = fcmp olt <16 x float> zeroinitializer, undef +  %v2 = select <16 x i1> %v1, <16 x i16> undef, <16 x i16> zeroinitializer +  %v3 = sext <16 x i16> %v2 to <16 x i32> +  store <16 x i32> %v3, <16 x i32>* undef, align 128 +  unreachable +} + +attributes #0 = { noinline norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx-length64b" } diff --git a/test/CodeGen/Hexagon/autohvx/isel-select-const.ll b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll new file mode 100644 index 000000000000..c251292c9da4 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/isel-select-const.ll @@ -0,0 +1,32 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; Check that this doesn't crash. 
+; CHECK: vlut32
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon-unknown--elf"
+
+define void @fred() #0 {
+b0:
+  %v1 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> undef, i32 3)
+  %v2 = bitcast <16 x i32> %v1 to <64 x i8>
+  %v3 = shufflevector <64 x i8> %v2, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v4 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %v3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %v5 = bitcast <64 x i8> %v4 to <16 x i32>
+  %v6 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v5)
+  store <16 x i32> %v6, <16 x i32>* undef, align 1
+  %v7 = tail call <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32> undef, <16 x i32> <i32 151388928, i32 353505036, i32 555621144, i32 757737252, i32 959853360, i32 1161969468, i32 1364085576, i32 1566201684, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <16 x i32> zeroinitializer, i32 3)
+  %v8 = bitcast <16 x i32> %v7 to <64 x i8>
+  %v9 = shufflevector <64 x i8> %v8, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %v10 = shufflevector <32 x i8> %v9, <32 x i8> zeroinitializer, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+  %v11 = bitcast <64 x i8> %v10 to <16 x i32>
+  %v12 = tail call <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32> %v11)
+  store <16 x i32> %v12, <16 x i32>* undef, align 1
+  unreachable
+}
+
+declare <16 x i32> @llvm.hexagon.V6.vshuffb(<16 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vlutvvb.oracc(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
index 88eaec938fd3..5ac0f59bd2d1 100644
--- a/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
+++ b/test/CodeGen/Hexagon/expand-vstorerw-undef.ll
@@ -12,7 +12,7 @@
 ; CHECK-LABEL: fred:
 ; CHECK: v[[REG:[0-9]+]] = vsplat
-; CHECK: vmem(r29+#6) = v[[REG]]
+; CHECK: vmem(r29+#{{[0-9]+}}) = v[[REG]]
 
 target triple = "hexagon"
diff --git a/test/CodeGen/Hexagon/v60-cur.ll b/test/CodeGen/Hexagon/v60-cur.ll
index 26d40c9a6975..d0ffe1d8fdd8 100644
--- a/test/CodeGen/Hexagon/v60-cur.ll
+++ b/test/CodeGen/Hexagon/v60-cur.ll
@@ -1,9 +1,8 @@
-; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
+; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; Test that we generate a .cur
 
-; CHECK: v{{[0-9]*}}.cur{{ *}}
-; CHECK: v{{[0-9]*}}.cur{{ *}}
+; CHECK: v{{[0-9]*}}.cur
 
 define void @conv3x3_i(i8* noalias nocapture readonly %iptr0, i32 %shift, i32 %width) #0 {
 entry:
diff --git a/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
new file mode 100644
index 000000000000..af2a55ea47d5
--- /dev/null
+++ b/test/CodeGen/Hexagon/vect/vect-extract-i1-debug.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=hexagon -debug-only=isel < %s 2>/dev/null
+; REQUIRES: asserts
+
+; Make sure that this doesn't crash. The debug option used to trigger a
+; failing assertion about a type mismatch in formal arguments.
+; CHECK: vaddub
+
+define i1 @t_i4x8(<4 x i8> %a, <4 x i8> %b) nounwind {
+entry:
+  %0 = add <4 x i8> %a, %b
+  %1 = bitcast <4 x i8> %0 to <32 x i1>
+  %2 = extractelement <32 x i1> %1, i32 0
+  ret i1 %2
+}
diff --git a/test/CodeGen/Hexagon/vect/vect-infloop.ll b/test/CodeGen/Hexagon/vect/vect-infloop.ll
index 4de390159fdd..9ee0b0ab3aa6 100644
--- a/test/CodeGen/Hexagon/vect/vect-infloop.ll
+++ b/test/CodeGen/Hexagon/vect/vect-infloop.ll
@@ -1,10 +1,10 @@
 ; Extracted from test/CodeGen/Generic/vector-casts.ll: used to loop indefinitely.
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: combine
+; CHECK: convert_df2w
 
 define void @a(<2 x double>* %p, <2 x i8>* %q) {
-        %t = load <2 x double>, <2 x double>* %p
-	%r = fptosi <2 x double> %t to <2 x i8>
-        store <2 x i8> %r, <2 x i8>* %q
-	ret void
+  %t = load <2 x double>, <2 x double>* %p
+  %r = fptosi <2 x double> %t to <2 x i8>
+  store <2 x i8> %r, <2 x i8>* %q
+  ret void
 }
diff --git a/test/CodeGen/Mips/llvm-ir/extractelement.ll b/test/CodeGen/Mips/llvm-ir/extractelement.ll
index f7b8ea5f9e15..4f926cbee0b2 100644
--- a/test/CodeGen/Mips/llvm-ir/extractelement.ll
+++ b/test/CodeGen/Mips/llvm-ir/extractelement.ll
@@ -15,5 +15,5 @@ define i1 @via_stack_bug(i8 signext %idx) {
 ; ALL-DAG:       sh     [[ONE]], 6($sp)
 ; ALL-DAG:       andi   [[MASKED_IDX:\$[0-9]+]], $4, 1
 ; ALL-DAG:       addiu  [[VPTR:\$[0-9]+]], $sp, 6
-; ALL-DAG:       or   [[EPTR:\$[0-9]+]], [[MASKED_IDX]], [[VPTR]]
+; ALL-DAG:       or   [[EPTR:\$[0-9]+]], [[VPTR]], [[MASKED_IDX]]
 ; ALL:           lbu    $2, 0([[EPTR]])
diff --git a/test/CodeGen/Mips/long-call-mcount.ll b/test/CodeGen/Mips/long-call-mcount.ll
new file mode 100644
index 000000000000..70a4410d060b
--- /dev/null
+++ b/test/CodeGen/Mips/long-call-mcount.ll
@@ -0,0 +1,19 @@
+; Check the call to mcount in the case of long/short call options.
+; RUN: llc -march=mips -target-abi o32 --mattr=+long-calls,+noabicalls < %s \
+; RUN:   | FileCheck -check-prefixes=CHECK,LONG %s
+; RUN: llc -march=mips -target-abi o32 --mattr=-long-calls,+noabicalls < %s \
+; RUN:   | FileCheck -check-prefixes=CHECK,SHORT %s
+
+; Function Attrs: noinline nounwind optnone
+define void @foo() #0 {
+entry:
+  ret void
+
+; CHECK-LABEL: foo
+; LONG:          lui     $1, %hi(_mcount)
+; LONG-NEXT:     addiu   $25, $1, %lo(_mcount)
+; LONG-NEXT:     jalr    $25
+; SHORT:         jal     _mcount
+}
+
+attributes #0 = { "instrument-function-entry-inlined"="_mcount" }
diff --git a/test/CodeGen/Mips/sll-micromips-r6-encoding.mir b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
new file mode 100644
index 000000000000..85ce251ac315
--- /dev/null
+++ b/test/CodeGen/Mips/sll-micromips-r6-encoding.mir
@@ -0,0 +1,46 @@
+# RUN: llc -march=mips -mcpu=mips32r6 -mattr=+micromips %s -start-after=xray-instrumentation -o - -show-mc-encoding | FileCheck %s
+
+# Test that 'sll $zero, $zero, 0' is correctly recognized as a real
+# instruction rather than as some unimplemented opcode when encoding an
+# instruction.
+
+# CHECK-LABEL: a:
+# CHECK:  nop                           # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:  jrc   $ra                     # encoding: [0x45,0xbf]
+---
+name:            a
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: false
+registers:
+liveins:
+  - { reg: '%a0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    1
+  adjustsStack:    false
+  hasCalls:        false
+  stackProtector:  ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+constants:
+body:             |
+  bb.0.entry:
+    renamable %zero = SLL_MMR6 killed renamable %zero, 0
+    JRC16_MM undef %ra, implicit %v0
+
+...
diff --git a/test/CodeGen/PowerPC/cmp_elimination.ll b/test/CodeGen/PowerPC/cmp_elimination.ll
index 3251ae2881b9..6bc8b8a041c2 100644
--- a/test/CodeGen/PowerPC/cmp_elimination.ll
+++ b/test/CodeGen/PowerPC/cmp_elimination.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
@@ -748,6 +747,37 @@ do.end:
   ret void
 }
+define void @func29(i32 signext %a) {
+; We cannot merge the two compares due to a difference in sign-extension
+; behavior. Equivalent C code example:
+;   int a = ..;
+;   if (a == -1) dummy1();
+;   if (a == (uint16_t)-1) dummy2();
+
+; CHECK-LABEL: @func29
+; CHECK: cmp
+; CHECK: cmp
+; CHECK: blr
+entry:
+  %cmp = icmp eq i32 %a, -1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+  tail call void @dummy1()
+  br label %if.end3
+
+if.else:
+  %cmp1 = icmp eq i32 %a, 65535
+  br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2:
+  tail call void @dummy2()
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
 declare void @dummy1()
 declare void @dummy2()
 declare void @dummy3()
diff --git a/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
new file mode 100644
index 000000000000..ad8dd90ea920
--- /dev/null
+++ b/test/CodeGen/PowerPC/uint-to-ppcfp128-crash.ll
@@ -0,0 +1,15 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 \
+; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; Ensure we don't crash by trying to convert directly from a subword load
+; to a ppc_fp128 as we do for conversions to f32/f64.
+define ppc_fp128 @test(i16* nocapture readonly %Ptr) {
+entry:
+  %0 = load i16, i16* %Ptr, align 2
+  %conv = uitofp i16 %0 to ppc_fp128
+  ret ppc_fp128 %conv
+; CHECK: lhz [[LD:[0-9]+]], 0(3)
+; CHECK: mtvsrwa [[MV:[0-9]+]], [[LD]]
+; CHECK: xscvsxddp [[CONV:[0-9]+]], [[MV]]
+; CHECK: bl __gcc_qadd
+}
diff --git a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
index 82c6c318abdc..247961e85b12 100644
--- a/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
+++ b/test/CodeGen/PowerPC/variable_elem_vec_extracts.ll
@@ -25,7 +25,7 @@ entry:
 ; CHECK: extsw 3, [[RSHREG]]
 ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
 ; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lwax 3, [[ELEMOFFREG]],
+; CHECK-P7: lwax 3, 3, [[ELEMOFFREG]]
 ; CHECK-BE-DAG: andi. [[ANDREG:\$[0-9]+]], 5, 2
 ; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 2
 ; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -54,7 +54,7 @@ entry:
 ; CHECK: mfvsrd 3,
 ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 3, 28, 28
 ; CHECK-P7-DAG: stxvd2x 34,
-; CHECK-P7: ldx 3, [[ELEMOFFREG]],
+; CHECK-P7: ldx 3, 3, [[ELEMOFFREG]]
 ; CHECK-BE-DAG: andi. [[ANDREG:\$[0-9]+]], 5, 1
 ; CHECK-BE-DAG: sldi [[SLREG:[0-9]+]], [[ANDREG]], 3
 ; CHECK-BE-DAG: lvsl [[SHMSKREG:[0-9]+]], 0, [[SLREG]]
@@ -77,7 +77,7 @@ entry:
 ; CHECK: xscvspdpn 1,
 ; CHECK-P7-DAG: rlwinm [[ELEMOFFREG:[0-9]+]], 5, 2, 28, 29
 ; CHECK-P7-DAG: stxvw4x 34,
-; CHECK-P7: lfsx 1, [[ELEMOFFREG]],
+; CHECK-P7: lfsx 1, 3, [[ELEMOFFREG]]
 ; CHECK-BE: sldi [[ELNOREG:[0-9]+]], 5, 2
 ; CHECK-BE: lvsl [[SHMSKREG:[0-9]+]], 0, [[ELNOREG]]
 ; CHECK-BE: vperm {{[0-9]+}}, 2, 2, [[SHMSKREG]]
diff --git a/test/CodeGen/Thumb2/t2sizereduction.mir b/test/CodeGen/Thumb2/t2sizereduction.mir
new file mode 100644
index 000000000000..377c0ccc7b0a
--- /dev/null
+++ b/test/CodeGen/Thumb2/t2sizereduction.mir
@@ -0,0 +1,83 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass=t2-reduce-size %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8m.main-arm-none-eabi"
+
+  ; Function Attrs: norecurse nounwind readnone
+  define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 {
+  entry:
+    %cmp6 = icmp sgt i32 %y, 0
+    br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+  for.body.preheader:                               ; preds = %entry
+    br label %for.body
+
+  for.cond.cleanup:                                 ; preds = %for.body, %entry
+    %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ]
+    ret i32 %sum.0.lcssa
+
+  for.body:                                         ; preds = %for.body, %for.body.preheader
+    %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ]
+    %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ]
+    %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+    %mul = mul nsw i32 %lsr.iv1, %sum.07
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %lsr.iv.next2 = add i32 %lsr.iv1, 1
+    %exitcond = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond, label %for.cond.cleanup, label %for.body
+  }
+
+  attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m7" "target-features"="+d16,+dsp,+fp-armv8,+fp-only-sp,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name:            test
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '' }
+  - { reg: '%r1', virtual-reg: '' }
+body:             |
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK:   liveins: %r0, %r1
+  ; CHECK:   %r2 = tMOVr %r0, 14, %noreg
+  ; CHECK:   %r0, dead %cpsr = tMOVi8 1, 14, %noreg
+  ; CHECK:   tCMPi8 %r1, 1, 14, %noreg, implicit-def %cpsr
+  ; CHECK:   t2Bcc %bb.2, 11, killed %cpsr
+  ; CHECK: bb.1.for.body:
+  ; CHECK:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK:   liveins: %r0, %r1, %r2
+  ; CHECK:   %r0, dead %cpsr = tMUL %r2, killed %r0, 14, %noreg
+  ; CHECK:   %r2, dead %cpsr = tADDi8 killed %r2, 1, 14, %noreg
+  ; CHECK:   %r1, %cpsr = tSUBi8 killed %r1, 1, 14, %noreg
+  ; CHECK:   t2Bcc %bb.1, 1, killed %cpsr
+  ; CHECK: bb.2.for.cond.cleanup:
+  ; CHECK:   liveins: %r0
+  ; CHECK:   tBX_RET 14, %noreg, implicit %r0
+  bb.0.entry:
+    successors: %bb.1.for.body, %bb.2.for.cond.cleanup
+    liveins: %r0, %r1
+
+    %r2 = tMOVr %r0, 14, _
+    %r0 = t2MOVi 1, 14, _, _
+    t2CMPri %r1, 1, 14, _, implicit-def %cpsr
+    t2Bcc %bb.2.for.cond.cleanup, 11, killed %cpsr
+
+  bb.1.for.body:
+    successors: %bb.2.for.cond.cleanup, %bb.1.for.body
+    liveins: %r0, %r1, %r2
+
+    %r0 = t2MUL %r2, killed %r0, 14, _
+    %r2 = t2ADDri killed %r2, 1, 14, _, _
+    %r1 = t2SUBri killed %r1, 1, 14, _, def %cpsr
+    t2Bcc %bb.1.for.body, 1, killed %cpsr
+
+  bb.2.for.cond.cleanup:
+    liveins: %r0
+
+    tBX_RET 14, _, implicit %r0
+
+...
diff --git a/test/CodeGen/X86/avg-mask.ll b/test/CodeGen/X86/avg-mask.ll
index 578d7aa75287..d32b0e70791a 100644
--- a/test/CodeGen/X86/avg-mask.ll
+++ b/test/CodeGen/X86/avg-mask.ll
@@ -143,16 +143,8 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
 ; AVX512F-NEXT:    shrq $32, %rax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %edi, (%rsp)
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm7
-; AVX512F-NEXT:    vpavgb %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm7
-; AVX512F-NEXT:    vpavgb %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm1
-; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
+; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
@@ -201,16 +193,8 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
 ; AVX512F-NEXT:    shrq $32, %rax
 ; AVX512F-NEXT:    movl %eax, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %edi, (%rsp)
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm5
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm6
-; AVX512F-NEXT:    vpavgb %xmm6, %xmm4, %xmm4
-; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm6
-; AVX512F-NEXT:    vpavgb %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0
+; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
 ; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index d1e26b787f48..dd11f6ca2935 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -90,150 +90,23 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
 define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
 ; SSE2-LABEL: avg_v32i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm3
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm8
-; SSE2-NEXT:    movdqa (%rsi), %xmm0
-; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm3, %xmm5
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; SSE2-NEXT:    movdqa %xmm5, %xmm6
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    movdqa %xmm8, %xmm7
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
-; SSE2-NEXT:    movdqa %xmm7, %xmm11
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm8, %xmm10
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
-; SSE2-NEXT:    movdqa %xmm2, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
-; SSE2-NEXT:    paddd %xmm6, %xmm9
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm5, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT:    paddd %xmm12, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm3, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
-; SSE2-NEXT:    paddd %xmm11, %xmm6
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT:    paddd %xmm10, %xmm7
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm8, %xmm1
-; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
-; SSE2-NEXT:    psubd %xmm4, %xmm9
-; SSE2-NEXT:    psubd %xmm4, %xmm2
-; SSE2-NEXT:    psubd %xmm4, %xmm5
-; SSE2-NEXT:    psubd %xmm4, %xmm0
-; SSE2-NEXT:    psubd %xmm4, %xmm6
-; SSE2-NEXT:    psubd %xmm4, %xmm3
-; SSE2-NEXT:    psubd %xmm4, %xmm7
-; SSE2-NEXT:    psubd %xmm4, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    psrld $1, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm9
-; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm4, %xmm9
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm9, %xmm2
-; SSE2-NEXT:    pand %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm5, %xmm0
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    packuswb %xmm6, %xmm3
-; SSE2-NEXT:    pand %xmm4, %xmm7
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm7, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    movdqu %xmm1, (%rax)
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE2-NEXT:    movdqa (%rsi), %xmm1
+; SSE2-NEXT:    pavgb (%rdi), %xmm1
+; SSE2-NEXT:    pavgb 16(%rsi), %xmm0
 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; AVX1-LABEL: avg_v32i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm9
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm8
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm9
-; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
-; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm0, %xmm7, %xmm7
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm7, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm2
-; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpand %xmm0, %xmm6, %xmm3
-; AVX1-NEXT:    vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpavgb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovups %ymm0, (%rax)
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -265,444 +138,467 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
   ret void
 }
-define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
-; SSE2-LABEL: avg_v64i8:
+define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
+; SSE2-LABEL: avg_v48i8:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm6
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm1
-; SSE2-NEXT:    movdqa 48(%rdi), %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa (%rsi), %xmm5
+; SSE2-NEXT:    movdqa (%rdi), %xmm1
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm6
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm11
+; SSE2-NEXT:    movdqa (%rsi), %xmm12
 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm13
-; SSE2-NEXT:    movdqa 32(%rsi), %xmm11
-; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm4, %xmm7
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm2, %xmm15
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm15, %xmm14
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm10, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm7, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm1, %xmm7
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm4, %xmm10
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm12, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm6, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm13, %xmm4
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm4, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm14, %xmm12
-; SSE2-NEXT:    movdqa %xmm7, %xmm5
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm15, %xmm4
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm13, %xmm15
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm8, %xmm15
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm2, %xmm13
-; SSE2-NEXT:    movdqa %xmm11, %xmm6
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE2-NEXT:    movdqa %xmm6, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm5, %xmm9
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm7, %xmm6
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm11, %xmm14
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm2, %xmm14
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm1, %xmm11
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movdqa 48(%rsi), %xmm7
-; SSE2-NEXT:    movdqa %xmm7, %xmm3
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa 32(%rsi), %xmm0
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
+; SSE2-NEXT:    movdqa %xmm6, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT:    movdqa %xmm12, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm1, %xmm8
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm5
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    paddd %xmm1, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm2, %xmm7
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psubd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    psubd %xmm0, %xmm10
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psubd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    psubd %xmm0, %xmm12
-; SSE2-NEXT:    psubd %xmm0, %xmm4
-; SSE2-NEXT:    psubd %xmm0, %xmm15
-; SSE2-NEXT:    psubd %xmm0, %xmm13
-; SSE2-NEXT:    psubd %xmm0, %xmm9
-; SSE2-NEXT:    psubd %xmm0, %xmm6
-; SSE2-NEXT:    psubd %xmm0, %xmm14
-; SSE2-NEXT:    psubd %xmm0, %xmm11
-; SSE2-NEXT:    psubd %xmm0, %xmm8
-; SSE2-NEXT:    psubd %xmm0, %xmm3
-; SSE2-NEXT:    psubd %xmm0, %xmm5
-; SSE2-NEXT:    psubd %xmm0, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm10
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm10
-; SSE2-NEXT:    packuswb %xmm1, %xmm10
-; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm1, %xmm2
-; SSE2-NEXT:    packuswb %xmm10, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm11, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3],xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
+; SSE2-NEXT:    movdqa %xmm12, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm10, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm1, %xmm12
+; SSE2-NEXT:    movdqa %xmm13, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; SSE2-NEXT:    movdqa %xmm4, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm15, %xmm10
+; SSE2-NEXT:    movdqa %xmm2, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3],xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm5, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
+; SSE2-NEXT:    movdqa %xmm13, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm14, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm6, %xmm13
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm15, %xmm14
+; SSE2-NEXT:    movdqa %xmm11, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm2, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
+; SSE2-NEXT:    paddd %xmm5, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
+; SSE2-NEXT:    paddd %xmm11, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT:    psubd %xmm5, %xmm8
+; SSE2-NEXT:    psubd %xmm5, %xmm3
+; SSE2-NEXT:    psubd %xmm5, %xmm9
+; SSE2-NEXT:    psubd %xmm5, %xmm12
+; SSE2-NEXT:    psubd %xmm5, %xmm10
+; SSE2-NEXT:    psubd %xmm5, %xmm4
+; SSE2-NEXT:    psubd %xmm5, %xmm1
+; SSE2-NEXT:    psubd %xmm5, %xmm13
+; SSE2-NEXT:    psubd %xmm5, %xmm14
+; SSE2-NEXT:    psubd %xmm5, %xmm6
+; SSE2-NEXT:    psubd %xmm5, %xmm2
+; SSE2-NEXT:    psubd %xmm5, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm8
+; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255]
+; SSE2-NEXT:    pand %xmm7, %xmm8
+; SSE2-NEXT:    pand %xmm7, %xmm3
+; SSE2-NEXT:    packuswb %xmm8, %xmm3
 ; SSE2-NEXT:    psrld $1, %xmm12
-; SSE2-NEXT:    pand %xmm0, %xmm12
-; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    packuswb %xmm12, %xmm4
+; SSE2-NEXT:    psrld $1, %xmm9
+; SSE2-NEXT:    pand %xmm7, %xmm9
+; SSE2-NEXT:    pand %xmm7, %xmm12
+; SSE2-NEXT:    packuswb %xmm9, %xmm12
+; SSE2-NEXT:    packuswb %xmm3, %xmm12
+; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    psrld $1, %xmm10
+; SSE2-NEXT:    pand %xmm7, %xmm10
+; SSE2-NEXT:    pand %xmm7, %xmm4
+; SSE2-NEXT:    packuswb %xmm10, %xmm4
 ; SSE2-NEXT:    psrld $1, %xmm13
-; SSE2-NEXT:    psrld $1, %xmm15
-; SSE2-NEXT:    pand %xmm0, %xmm15
-; SSE2-NEXT:    pand %xmm0, %xmm13
-; SSE2-NEXT:    packuswb %xmm15, %xmm13
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    pand %xmm7, %xmm1
+; SSE2-NEXT:    pand %xmm7, %xmm13
+; SSE2-NEXT:    packuswb %xmm1, %xmm13
 ; SSE2-NEXT:    packuswb %xmm4, %xmm13
 ; SSE2-NEXT:    psrld $1, %xmm6
-; SSE2-NEXT:    psrld $1, %xmm9
-; SSE2-NEXT:    pand %xmm0, %xmm9
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    packuswb %xmm9, %xmm6
-; SSE2-NEXT:    psrld $1, %xmm11
 ; SSE2-NEXT:    psrld $1, %xmm14
-; SSE2-NEXT:    pand %xmm0, %xmm14
-; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    packuswb %xmm14, %xmm11
-; SSE2-NEXT:    packuswb %xmm6, %xmm11
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    psrld $1, %xmm8
-; SSE2-NEXT:    pand %xmm0, %xmm8
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    packuswb %xmm8, %xmm3
-; SSE2-NEXT:    psrld $1, %xmm7
-; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm7
-; SSE2-NEXT:    packuswb %xmm5, %xmm7
-; SSE2-NEXT:    packuswb %xmm3, %xmm7
-; SSE2-NEXT:    movdqu %xmm7, (%rax)
-; SSE2-NEXT:    movdqu %xmm11, (%rax)
+; SSE2-NEXT:    pand %xmm7, %xmm14
+; SSE2-NEXT:    pand %xmm7, %xmm6
+; SSE2-NEXT:    packuswb %xmm14, %xmm6
+; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm2
+; SSE2-NEXT:    pand %xmm7, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm6, %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
 ; SSE2-NEXT:    movdqu %xmm13, (%rax)
-; SSE2-NEXT:    movdqu %xmm1, (%rax)
+; SSE2-NEXT:    movdqu %xmm12, (%rax)
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: avg_v64i8:
+; AVX1-LABEL: avg_v48i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    subq $24, %rsp
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm0
-; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm13
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm4, %xmm6, %xmm12
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm4, %xmm15, %xmm11
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm0, %xmm8, %xmm10
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm1, %xmm9, %xmm8
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm2, %xmm14, %xmm9
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm3 # 16-byte Folded Reload
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm5, %xmm2 # 16-byte Folded Reload
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT:    vpaddd %xmm6, %xmm5, %xmm1
-; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm14
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm5
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6
-; AVX1-NEXT:    vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm15
-; AVX1-NEXT:    vmovdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpsubd %xmm0, %xmm13, %xmm13
-; AVX1-NEXT:    vpsubd %xmm0, %xmm12, %xmm12
-; AVX1-NEXT:    vpsubd %xmm0, %xmm11, %xmm11
-; AVX1-NEXT:    vpsubd %xmm0, %xmm10, %xmm10
-; AVX1-NEXT:    vpsubd %xmm0, %xmm8, %xmm8
-; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm9
-; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm4
-; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm7
-; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm5
+; AVX1-NEXT:    vmovdqa (%rsi), %ymm1
+; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
 ; AVX1-NEXT:    vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm1
-; AVX1-NEXT:    vpsrld $1, %xmm14, %xmm14
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vpand %xmm5, %xmm14, %xmm14
-; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vpackuswb %xmm14, %xmm1, %xmm1
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm6
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpsrld $1, %xmm13, %xmm2
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm6
-; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm7
-; AVX1-NEXT:    vpand %xmm5, %xmm7, %xmm7
-; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpackuswb %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm6, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm2
-; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
-; AVX1-NEXT:    vpackuswb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm5[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vmovdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm5, %xmm7, %xmm5
+; AVX1-NEXT:    vmovdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm9
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm8
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm2, %xmm11, %xmm11
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm5, %xmm13, %xmm13
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm1, %xmm15, %xmm15
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm4, %xmm14, %xmm14
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd %xmm6, %xmm10, %xmm6
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm7, %xmm7
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT:    vpsubd %xmm7, %xmm1, %xmm10
+; AVX1-NEXT:    vpsubd %xmm7, %xmm9, %xmm9
+; AVX1-NEXT:    vpsubd %xmm7, %xmm8, %xmm8
+; AVX1-NEXT:    vpsubd %xmm7, %xmm11, %xmm11
+; AVX1-NEXT:    vpsubd %xmm7, %xmm12, %xmm12
+; AVX1-NEXT:    vpsubd %xmm7, %xmm3, %xmm3
+; AVX1-NEXT:    vpsubd %xmm7, %xmm13, %xmm4
+; AVX1-NEXT:    vpsubd %xmm7, %xmm15, %xmm5
+; AVX1-NEXT:    vpsubd %xmm7, %xmm14, %xmm1
+; AVX1-NEXT:    vpsubd %xmm7, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubd %xmm7, %xmm2, %xmm2
+; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm14
+; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm15
+; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm13
+; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5
 ; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm6
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm12
+; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm11
+; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7
+; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm2
+; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm6
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm6
+; AVX1-NEXT:    vpshufb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm7
+; AVX1-NEXT:    vpshufb %xmm0, %xmm11, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm7[0],xmm1[0]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT:    vpshufb %xmm0, %xmm12, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm3
+; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm4
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT:    vpshufb %xmm0, %xmm13, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm15, %xmm3
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufb %xmm0, %xmm14, %xmm3
 ; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vmovups %ymm0, (%rax)
+; AVX1-NEXT:    vpshufb %xmm0, %xmm4, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
+; AVX1-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT:    vmovups %ymm1, (%rax)
-; AVX1-NEXT:    addq $24, %rsp
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: avg_v64i8:
+; AVX2-LABEL: avg_v48i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm1, %ymm1
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm6, %ymm6
-; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT:    vpaddd %ymm8, %ymm7, %ymm7
-; AVX2-NEXT:    vpcmpeqd %ymm8, %ymm8, %ymm8
-; AVX2-NEXT:    vpsubd %ymm8, %ymm0, %ymm9
-; AVX2-NEXT:    vpsubd %ymm8, %ymm1, %ymm10
-; AVX2-NEXT:    vpsubd %ymm8, %ymm2, %ymm2
-; AVX2-NEXT:    vpsubd %ymm8, %ymm3, %ymm3
-; AVX2-NEXT:    vpsubd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT:    vpsubd %ymm8, %ymm5, %ymm5
-; AVX2-NEXT:    vpsubd %ymm8, %ymm6, %ymm1
-; AVX2-NEXT:    vpsubd %ymm8, %ymm7, %ymm0
-; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm11
-; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm12
+; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT:    vmovdqa (%rsi), %ymm3
+; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX2-NEXT:    vpand %ymm9, %ymm5, %ymm5
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm11 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm10 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero +; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm6[2,3,0,1] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX2-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX2-NEXT:    vpand %ymm9, %ymm2, %ymm2 +; AVX2-NEXT:    vpaddd %ymm2, %ymm5, %ymm2 +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT:    vpaddd %ymm4, %ymm7, %ymm4 +; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT:    vpaddd %ymm3, %ymm11, %ymm3 +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT:    vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT:    vpaddd %ymm0, %ymm10, %ymm0 +; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX2-NEXT:    vpsubd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT:    vpsubd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT:    vpsubd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT:    vpsubd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT:    vpsubd %ymm6, %ymm5, %ymm5 +; AVX2-NEXT:    vpsubd %ymm6, %ymm0, %ymm0 +; 
AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0  ; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5 +; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3 +; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1  ; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT:    vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT:    vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT:    vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT:    vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT:    vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm4 +; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT:    vpshufb %ymm6, %ymm4, %ymm4  ; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-NEXT:    vmovdqa {{.*#+}} xmm7 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-NEXT:    vpshufb %xmm7, %xmm4, %xmm4 +; AVX2-NEXT:    vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT:    vpshufb %xmm7, %xmm1, %xmm1  ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT:    vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT:    vpshufb %ymm2, %ymm11, %ymm2 +; AVX2-NEXT:    vpshufb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT:    vpshufb %ymm6, %ymm3, %ymm3 +; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-NEXT:    vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT:    vpshufb %ymm6, %ymm5, %ymm2  ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT:    vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-NEXT:    vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT:    vpshufb %xmm7, %xmm0, %xmm0 +; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX2-NEXT:    vmovdqu %xmm0, (%rax)  ; AVX2-NEXT:    vmovdqu %ymm1, (%rax) +; AVX2-NEXT:    vzeroupper +; AVX2-NEXT:    retq +; +; AVX512F-LABEL: avg_v48i8: +; AVX512F:       # %bb.0: +; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT:    
vmovdqa (%rsi), %ymm2 +; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm3 +; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4 +; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm5 +; AVX512F-NEXT:    vpavgb %xmm5, %xmm4, %xmm4 +; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT:    vmovdqu %xmm1, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm0, (%rax) +; AVX512F-NEXT:    vzeroupper +; AVX512F-NEXT:    retq +; +; AVX512BW-LABEL: avg_v48i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0 +; AVX512BW-NEXT:    vmovdqa64 (%rsi), %zmm1 +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3 +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero,xmm3[8],zero,zero,zero,xmm3[9],zero,zero,zero,xmm3[10],zero,zero,zero,xmm3[11],zero,zero,zero,xmm3[12],zero,zero,zero,xmm3[13],zero,zero,zero,xmm3[14],zero,zero,zero,xmm3[15],zero,zero,zero +; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0 +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512BW-NEXT:    vpaddd %zmm4, %zmm2, %zmm2 +; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm4 +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero +; AVX512BW-NEXT:    vpaddd %zmm4, %zmm3, %zmm3 +; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1 +; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    
vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512BW-NEXT:    vpsubd %zmm1, %zmm2, %zmm2 +; AVX512BW-NEXT:    vpsubd %zmm1, %zmm3, %zmm3 +; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpsrld $1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpsrld $1, %zmm3, %zmm1 +; AVX512BW-NEXT:    vpsrld $1, %zmm2, %zmm2 +; AVX512BW-NEXT:    vpmovdw %zmm2, %ymm2 +; AVX512BW-NEXT:    vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT:    vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT:    vmovdqu %ymm1, (%rax) +; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, (%rax) +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +  %1 = load <48 x i8>, <48 x i8>* %a +  %2 = load <48 x i8>, <48 x i8>* %b +  %3 = zext <48 x i8> %1 to <48 x i32> +  %4 = zext <48 x i8> %2 to <48 x i32> +  %5 = add nuw nsw <48 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +  %6 = add nuw nsw <48 x i32> %5, %4 +  %7 = lshr <48 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +  %8 = trunc <48 x i32> %7 to <48 x i8> +  store <48 x i8> %8, <48 x i8>* undef, align 4 +  ret void +} + +define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind { +; SSE2-LABEL: avg_v64i8: +; SSE2:       # %bb.0: +; SSE2-NEXT:    movdqa 32(%rdi), %xmm0 +; SSE2-NEXT:    movdqa (%rsi), %xmm1 +; SSE2-NEXT:    movdqa 16(%rsi), %xmm2 +; SSE2-NEXT:    movdqa 48(%rsi), %xmm3 +; SSE2-NEXT:    pavgb (%rdi), %xmm1 +; SSE2-NEXT:    pavgb 16(%rdi), %xmm2 +; SSE2-NEXT:    pavgb 32(%rsi), %xmm0 +; SSE2-NEXT:    pavgb 48(%rdi), %xmm3 +; SSE2-NEXT:    movdqu %xmm3, (%rax) +; SSE2-NEXT:    movdqu %xmm0, (%rax) +; SSE2-NEXT:    movdqu %xmm2, (%rax) +; SSE2-NEXT:    movdqu %xmm1, (%rax) +; SSE2-NEXT:    retq +; +; AVX1-LABEL: avg_v64i8: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm2 +; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpavgb %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT:    vpavgb %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vmovups %ymm1, (%rax) +; AVX1-NEXT:    vmovups %ymm0, (%rax) +; AVX1-NEXT:    vzeroupper +; AVX1-NEXT:    retq +; +; AVX2-LABEL: avg_v64i8: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX2-NEXT:    vpavgb (%rdi), %ymm1, %ymm1 +; AVX2-NEXT:    vpavgb 32(%rsi), %ymm0, %ymm0  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax) +; 
AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ;  ; AVX512F-LABEL: avg_v64i8:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpaddd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpaddd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpaddd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpaddd %zmm4, 
%zmm3, %zmm3 -; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT:    vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT:    vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT:    vpmovdb %zmm2, %xmm1 -; AVX512F-NEXT:    vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512F-NEXT:    vmovdqu %ymm1, (%rax) +; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT:    vpavgb (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT:    vpavgb 32(%rsi), %ymm0, %ymm0  ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq  ; @@ -782,81 +678,23 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {  define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {  ; SSE2-LABEL: avg_v16i16:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm2 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm4 -; SSE2-NEXT:    movdqa (%rsi), %xmm0 -; SSE2-NEXT:    movdqa 16(%rsi), %xmm1 -; SSE2-NEXT:    pxor %xmm5, %xmm5 -; SSE2-NEXT:    movdqa %xmm2, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT:    movdqa %xmm4, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT:    movdqa %xmm0, %xmm3 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT:    paddd %xmm6, %xmm3 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT:    paddd %xmm2, %xmm0 -; SSE2-NEXT:    movdqa %xmm1, %xmm2 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT:    paddd %xmm7, %xmm2 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT:    paddd %xmm4, %xmm1 -; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT:    psubd %xmm4, %xmm3 -; SSE2-NEXT:    psubd %xmm4, %xmm0 -; SSE2-NEXT:    psubd %xmm4, %xmm2 -; SSE2-NEXT:    psubd %xmm4, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm0 -; SSE2-NEXT:    psrad $16, %xmm0 -; SSE2-NEXT:    packssdw %xmm3, %xmm0 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm2, %xmm1 -; SSE2-NEXT:    movdqu %xmm1, (%rax) +; SSE2-NEXT:    movdqa 16(%rdi), %xmm0 +; SSE2-NEXT:    movdqa (%rsi), %xmm1 +; SSE2-NEXT:    pavgw (%rdi), %xmm1 +; SSE2-NEXT:    pavgw 16(%rsi), %xmm0  ; SSE2-NEXT:    movdqu %xmm0, (%rax) +; SSE2-NEXT:    movdqu %xmm1, (%rax) 
 ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v16i16:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpavgw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq @@ -891,207 +729,60 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {  define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {  ; SSE2-LABEL: avg_v32i16:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm4 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm11 -; SSE2-NEXT:    movdqa 32(%rdi), %xmm10 -; SSE2-NEXT:    movdqa 48(%rdi), %xmm8 -; SSE2-NEXT:    movdqa (%rsi), %xmm9 -; SSE2-NEXT:    movdqa 16(%rsi), %xmm1 -; SSE2-NEXT:    movdqa 32(%rsi), %xmm2 +; SSE2-NEXT:    movdqa 32(%rdi), %xmm0 +; SSE2-NEXT:    movdqa (%rsi), %xmm1 +; SSE2-NEXT:    movdqa 16(%rsi), %xmm2  ; SSE2-NEXT:    movdqa 48(%rsi), %xmm3 -; SSE2-NEXT:    pxor %xmm0, %xmm0 -; SSE2-NEXT:    movdqa %xmm4, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm11, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = 
xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm10, %xmm12 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm8, %xmm13 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm9, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm6, %xmm7 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm4, %xmm9 -; SSE2-NEXT:    movdqa %xmm1, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm5, %xmm6 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm11, %xmm1 -; SSE2-NEXT:    movdqa %xmm2, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm12, %xmm5 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm10, %xmm2 -; SSE2-NEXT:    movdqa %xmm3, %xmm4 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm13, %xmm4 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm8, %xmm3 -; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT:    psubd %xmm0, %xmm7 -; SSE2-NEXT:    psubd %xmm0, %xmm9 -; SSE2-NEXT:    psubd %xmm0, %xmm6 -; SSE2-NEXT:    psubd %xmm0, %xmm1 -; SSE2-NEXT:    psubd %xmm0, %xmm5 -; SSE2-NEXT:    psubd %xmm0, %xmm2 -; SSE2-NEXT:    psubd %xmm0, %xmm4 -; SSE2-NEXT:    psubd %xmm0, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm9 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm7 -; SSE2-NEXT:    psrad $16, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm9 -; SSE2-NEXT:    psrad $16, %xmm9 -; SSE2-NEXT:    packssdw %xmm7, %xmm9 -; SSE2-NEXT:    pslld $16, %xmm6 -; SSE2-NEXT:    psrad $16, %xmm6 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm6, %xmm1 -; SSE2-NEXT:    pslld $16, %xmm5 -; SSE2-NEXT:    psrad $16, %xmm5 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    packssdw %xmm5, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm4 -; SSE2-NEXT:    psrad $16, %xmm4 -; SSE2-NEXT:    pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    packssdw %xmm4, %xmm3 +; SSE2-NEXT:    pavgw (%rdi), %xmm1 +; SSE2-NEXT:    pavgw 16(%rdi), %xmm2 +; SSE2-NEXT:    pavgw 32(%rsi), %xmm0 +; SSE2-NEXT:    pavgw 48(%rdi), %xmm3  ; SSE2-NEXT:    movdqu %xmm3, (%rax) +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    movdqu %xmm2, (%rax)  ; SSE2-NEXT:    movdqu %xmm1, (%rax) -; SSE2-NEXT:    movdqu %xmm9, (%rax)  ; SSE2-NEXT:    retq  ;  ; 
AVX1-LABEL: avg_v32i16:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = 
xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT:    vmovups %ymm0, (%rax) +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm2 +; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5 +; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpavgw %xmm0, %xmm2, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4 +; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT:    vpavgw %xmm1, %xmm3, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1  ; AVX1-NEXT:    vmovups %ymm1, (%rax) +; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ;  ; AVX2-LABEL: avg_v32i16:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT:    vpsubd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpsubd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpsubd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT:    vpsubd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT:    vmovdqu %ymm1, (%rax) +; AVX2-NEXT:    vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT:    vmovdqa (%rsi), %ymm1 +; 
AVX2-NEXT:    vpavgw (%rdi), %ymm1, %ymm1 +; AVX2-NEXT:    vpavgw 32(%rsi), %ymm0, %ymm0  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax) +; AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ;  ; AVX512F-LABEL: avg_v32i16:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsubd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsubd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovdw %zmm0, (%rax) -; AVX512F-NEXT:    vpmovdw %zmm1, (%rax) +; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm0 +; AVX512F-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT:    vpavgw (%rdi), %ymm1, %ymm1 +; AVX512F-NEXT:    vpavgw 32(%rsi), %ymm0, %ymm0 +; AVX512F-NEXT:    vmovdqu %ymm0, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq  ; @@ -1199,150 +890,23 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {  define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {  ; SSE2-LABEL: avg_v32i8_2:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm3 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm8 -; SSE2-NEXT:    movdqa (%rsi), %xmm0 +; SSE2-NEXT:    movdqa (%rdi), %xmm0  ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1 -; SSE2-NEXT:    pxor %xmm4, %xmm4 -; SSE2-NEXT:    movdqa %xmm3, %xmm5 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT:    movdqa %xmm5, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm3, %xmm12 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT:    movdqa %xmm8, %xmm7 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] 
-; SSE2-NEXT:    movdqa %xmm7, %xmm11 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm8, %xmm10 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT:    movdqa %xmm0, %xmm2 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT:    movdqa %xmm2, %xmm9 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT:    paddd %xmm6, %xmm9 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT:    paddd %xmm5, %xmm2 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm0, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT:    paddd %xmm12, %xmm5 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT:    paddd %xmm3, %xmm0 -; SSE2-NEXT:    movdqa %xmm1, %xmm3 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT:    movdqa %xmm3, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT:    paddd %xmm11, %xmm6 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT:    paddd %xmm7, %xmm3 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT:    paddd %xmm10, %xmm7 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT:    paddd %xmm8, %xmm1 -; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT:    psubd %xmm4, %xmm9 -; SSE2-NEXT:    psubd %xmm4, %xmm2 -; SSE2-NEXT:    psubd %xmm4, %xmm5 -; SSE2-NEXT:    psubd %xmm4, %xmm0 -; SSE2-NEXT:    psubd %xmm4, %xmm6 -; SSE2-NEXT:    psubd %xmm4, %xmm3 -; SSE2-NEXT:    psubd %xmm4, %xmm7 -; SSE2-NEXT:    psubd %xmm4, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm9 -; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT:    pand %xmm4, %xmm9 -; SSE2-NEXT:    pand %xmm4, %xmm2 -; SSE2-NEXT:    packuswb %xmm9, %xmm2 -; 
SSE2-NEXT:    pand %xmm4, %xmm5 -; SSE2-NEXT:    pand %xmm4, %xmm0 -; SSE2-NEXT:    packuswb %xmm5, %xmm0 -; SSE2-NEXT:    packuswb %xmm2, %xmm0 -; SSE2-NEXT:    pand %xmm4, %xmm6 -; SSE2-NEXT:    pand %xmm4, %xmm3 -; SSE2-NEXT:    packuswb %xmm6, %xmm3 -; SSE2-NEXT:    pand %xmm4, %xmm7 -; SSE2-NEXT:    pand %xmm4, %xmm1 -; SSE2-NEXT:    packuswb %xmm7, %xmm1 -; SSE2-NEXT:    packuswb %xmm3, %xmm1 +; SSE2-NEXT:    pavgb (%rsi), %xmm0 +; SSE2-NEXT:    pavgb 16(%rdi), %xmm1  ; SSE2-NEXT:    movdqu %xmm1, (%rax)  ; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v32i8_2:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT:    vpsrld $1, %xmm6, 
%xmm6 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT:    vpand %xmm0, %xmm7, %xmm7 -; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm1 -; AVX1-NEXT:    vpackuswb %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm2 -; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpand %xmm0, %xmm4, %xmm2 -; AVX1-NEXT:    vpand %xmm0, %xmm5, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpand %xmm0, %xmm6, %xmm3 -; AVX1-NEXT:    vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq @@ -1377,249 +941,31 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {  define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {  ; SSE2-LABEL: avg_v64i8_2:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rsi), %xmm14 -; SSE2-NEXT:    movdqa 16(%rsi), %xmm12 +; SSE2-NEXT:    movdqa (%rsi), %xmm0 +; SSE2-NEXT:    movdqa 16(%rsi), %xmm1  ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2 -; SSE2-NEXT:    movdqa 48(%rsi), %xmm1 -; SSE2-NEXT:    pxor %xmm0, %xmm0 -; SSE2-NEXT:    movdqa %xmm14, %xmm7 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm7, %xmm15 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm14, %xmm8 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm12, %xmm6 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm6, %xmm13 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm12, %xmm9 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = 
xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm2, %xmm5 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm5, %xmm11 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm2, %xmm10 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm1, %xmm4 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm4, %xmm3 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm3 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm1, %xmm1 -; SSE2-NEXT:    paddd %xmm3, %xmm3 -; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    paddd %xmm4, %xmm4 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT:    paddd %xmm3, %xmm3 -; SSE2-NEXT:    paddd %xmm2, %xmm2 -; SSE2-NEXT:    paddd %xmm10, %xmm10 -; SSE2-NEXT:    paddd %xmm5, %xmm5 -; SSE2-NEXT:    paddd %xmm11, %xmm11 -; SSE2-NEXT:    paddd %xmm12, %xmm12 -; SSE2-NEXT:    paddd %xmm9, %xmm9 -; SSE2-NEXT:    paddd %xmm6, %xmm6 -; SSE2-NEXT:    paddd %xmm13, %xmm13 -; SSE2-NEXT:    paddd %xmm14, %xmm14 -; SSE2-NEXT:    paddd %xmm8, %xmm8 -; SSE2-NEXT:    paddd %xmm7, %xmm7 -; SSE2-NEXT:    paddd %xmm15, %xmm15 -; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT:    psubd %xmm0, %xmm15 -; SSE2-NEXT:    psubd %xmm0, %xmm7 -; SSE2-NEXT:    psubd %xmm0, %xmm8 -; SSE2-NEXT:    psubd %xmm0, %xmm14 -; SSE2-NEXT:    psubd %xmm0, %xmm13 -; SSE2-NEXT:    psubd %xmm0, %xmm6 -; SSE2-NEXT:    psubd %xmm0, %xmm9 -; SSE2-NEXT:    psubd %xmm0, %xmm12 -; SSE2-NEXT:    psubd %xmm0, %xmm11 -; SSE2-NEXT:    psubd %xmm0, %xmm5 -; SSE2-NEXT:    psubd %xmm0, %xmm10 -; SSE2-NEXT:    psubd %xmm0, %xmm2 -; SSE2-NEXT:    psubd %xmm0, %xmm3 -; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    psubd %xmm0, %xmm4 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT:    psubd %xmm0, %xmm3 -; SSE2-NEXT:    psubd %xmm0, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm7 -; 
SSE2-NEXT:    psrld $1, %xmm15 -; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT:    pand %xmm0, %xmm15 -; SSE2-NEXT:    pand %xmm0, %xmm7 -; SSE2-NEXT:    packuswb %xmm15, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm14 -; SSE2-NEXT:    psrld $1, %xmm8 -; SSE2-NEXT:    pand %xmm0, %xmm8 -; SSE2-NEXT:    pand %xmm0, %xmm14 -; SSE2-NEXT:    packuswb %xmm8, %xmm14 -; SSE2-NEXT:    packuswb %xmm7, %xmm14 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm13 -; SSE2-NEXT:    pand %xmm0, %xmm13 -; SSE2-NEXT:    pand %xmm0, %xmm6 -; SSE2-NEXT:    packuswb %xmm13, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm12 -; SSE2-NEXT:    psrld $1, %xmm9 -; SSE2-NEXT:    pand %xmm0, %xmm9 -; SSE2-NEXT:    pand %xmm0, %xmm12 -; SSE2-NEXT:    packuswb %xmm9, %xmm12 -; SSE2-NEXT:    packuswb %xmm6, %xmm12 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm11 -; SSE2-NEXT:    pand %xmm0, %xmm11 -; SSE2-NEXT:    pand %xmm0, %xmm5 -; SSE2-NEXT:    packuswb %xmm11, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm10 -; SSE2-NEXT:    pand %xmm0, %xmm10 -; SSE2-NEXT:    pand %xmm0, %xmm2 -; SSE2-NEXT:    packuswb %xmm10, %xmm2 -; SSE2-NEXT:    packuswb %xmm5, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm4 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    pand %xmm0, %xmm5 -; SSE2-NEXT:    pand %xmm0, %xmm4 -; SSE2-NEXT:    packuswb %xmm5, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    movdqa %xmm3, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    pand %xmm0, %xmm5 -; SSE2-NEXT:    pand %xmm0, %xmm1 -; SSE2-NEXT:    packuswb %xmm5, %xmm1 -; SSE2-NEXT:    packuswb %xmm4, %xmm1 -; SSE2-NEXT:    movdqu %xmm1, (%rax) +; SSE2-NEXT:    movdqa 48(%rsi), %xmm3 +; SSE2-NEXT:    pavgb %xmm0, %xmm0 +; SSE2-NEXT:    pavgb %xmm1, %xmm1 +; SSE2-NEXT:    pavgb %xmm2, %xmm2 +; SSE2-NEXT:    pavgb %xmm3, %xmm3 +; SSE2-NEXT:    movdqu %xmm3, (%rax)  ; SSE2-NEXT:    movdqu %xmm2, (%rax) -; SSE2-NEXT:    movdqu %xmm12, (%rax) -; SSE2-NEXT:    movdqu %xmm14, (%rax) +; SSE2-NEXT:    movdqu %xmm1, (%rax) +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v64i8_2:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm7, %xmm7 -; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpaddd %xmm6, %xmm6, %xmm6 -; AVX1-NEXT:    vmovdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm6 -; AVX1-NEXT:    vpaddd %xmm4, %xmm4, %xmm5 -; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm4 -; AVX1-NEXT:    vpaddd %xmm2, %xmm2, %xmm3 -; AVX1-NEXT:    vpaddd %xmm1, %xmm1, %xmm2 -; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm1 -; AVX1-NEXT:    vpaddd %xmm15, %xmm15, %xmm15 -; AVX1-NEXT:    vpaddd %xmm14, %xmm14, %xmm14 -; AVX1-NEXT:    vpaddd %xmm13, %xmm13, %xmm13 -; AVX1-NEXT:    vpaddd %xmm12, %xmm12, %xmm12 -; AVX1-NEXT:    vpaddd %xmm11, %xmm11, %xmm11 -; AVX1-NEXT:    vpaddd %xmm10, %xmm10, %xmm10 -; AVX1-NEXT:    vpaddd %xmm9, %xmm9, %xmm9 -; AVX1-NEXT:    vpaddd %xmm8, %xmm8, %xmm8 -; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm0, %xmm8, %xmm7 -; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT:    vpsubd %xmm0, %xmm10, %xmm10 -; AVX1-NEXT:    vpsubd %xmm0, %xmm11, %xmm9 -; AVX1-NEXT:    vpsubd %xmm0, %xmm12, %xmm7 -; AVX1-NEXT:    vmovdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpsubd %xmm0, %xmm13, %xmm11 -; AVX1-NEXT:    vpsubd %xmm0, %xmm14, %xmm13 -; AVX1-NEXT:    vpsubd %xmm0, %xmm15, %xmm12 -; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm15 -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm2 -; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm14 -; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm3 -; AVX1-NEXT:    vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm5 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vmovdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm6 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm8 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm8 -; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpackuswb %xmm8, %xmm6, %xmm8 -; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm6 -; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm4 -; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpackuswb %xmm4, %xmm6, %xmm4 -; AVX1-NEXT:    
vpackuswb %xmm8, %xmm4, %xmm4 -; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm6 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm3 -; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm6, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm6 -; AVX1-NEXT:    vpsrld $1, %xmm13, %xmm0 -; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpackuswb %xmm0, %xmm6, %xmm0 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT:    vpsrld $1, %xmm15, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm14, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm2 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm1 +; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT:    vpavgb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT:    vpavgb %xmm1, %xmm1, %xmm1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1  ; AVX1-NEXT:    vmovups %ymm1, (%rax)  ; AVX1-NEXT:    vmovups %ymm0, (%rax) @@ -1628,71 +974,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {  ;  ; AVX2-LABEL: avg_v64i8_2:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpaddd %ymm7, %ymm7, %ymm7 -; AVX2-NEXT:    vpaddd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT:    vpaddd %ymm5, %ymm5, %ymm5 -; AVX2-NEXT:    vpaddd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT:    vpaddd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT:    vpaddd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm0 -; AVX2-NEXT:    vpcmpeqd %ymm8, %ymm8, %ymm8 -; AVX2-NEXT:    vpsubd %ymm8, %ymm0, %ymm9 -; AVX2-NEXT:    vpsubd %ymm8, %ymm1, %ymm10 -; AVX2-NEXT:    vpsubd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT:    vpsubd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT:    vpsubd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT:    vpsubd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT:    vpsubd %ymm8, %ymm6, %ymm1 -; AVX2-NEXT:    vpsubd %ymm8, %ymm7, %ymm0 -; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm11 -; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm12 -; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm6 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm7 -; AVX2-NEXT:    vpsrld $1, %ymm10, %ymm8 -; AVX2-NEXT:    vpsrld $1, %ymm9, %ymm3 -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3 -; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3] -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT:    vpshufb %xmm3, %xmm9, %xmm0 -; AVX2-NEXT:    vpshufb %ymm2, %ymm8, %ymm8 -; AVX2-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm8, %xmm1 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT:    vpshufb %ymm2, %ymm7, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vpshufb %ymm2, %ymm6, %ymm6 -; AVX2-NEXT:    vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm6 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm6[0],xmm1[0] -; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT:    vpshufb %ymm2, %ymm4, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm4 -; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] -; AVX2-NEXT:    vpshufb %ymm2, %ymm12, %ymm4 -; AVX2-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-NEXT:    vpshufb %ymm2, %ymm11, %ymm2 -; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} 
xmm2 = xmm2[0],xmm4[0] -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-NEXT:    vmovdqa (%rsi), %ymm0 +; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT:    vpavgb %ymm0, %ymm0, %ymm0 +; AVX2-NEXT:    vpavgb %ymm1, %ymm1, %ymm1  ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)  ; AVX2-NEXT:    vzeroupper @@ -1700,29 +985,10 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {  ;  ; AVX512F-LABEL: avg_v64i8_2:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpaddd %zmm3, %zmm3, %zmm3 -; AVX512F-NEXT:    vpaddd %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT:    vpaddd %zmm1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpaddd %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsubd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT:    vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT:    vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT:    vpmovdb %zmm2, %xmm1 -; AVX512F-NEXT:    vpmovdb %zmm3, %xmm2 -; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0 +; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT:    vpavgb %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT:    vpavgb %ymm1, %ymm1, %ymm1  ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)  ; AVX512F-NEXT:    vzeroupper @@ -1805,81 +1071,23 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {  define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {  ; SSE2-LABEL: avg_v16i16_2:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm2 -; 
SSE2-NEXT:    movdqa 16(%rdi), %xmm4 -; SSE2-NEXT:    movdqa (%rsi), %xmm0 +; SSE2-NEXT:    movdqa (%rdi), %xmm0  ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1 -; SSE2-NEXT:    pxor %xmm5, %xmm5 -; SSE2-NEXT:    movdqa %xmm2, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] -; SSE2-NEXT:    movdqa %xmm4, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; SSE2-NEXT:    movdqa %xmm0, %xmm3 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT:    paddd %xmm6, %xmm3 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT:    paddd %xmm2, %xmm0 -; SSE2-NEXT:    movdqa %xmm1, %xmm2 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT:    paddd %xmm7, %xmm2 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] -; SSE2-NEXT:    paddd %xmm4, %xmm1 -; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4 -; SSE2-NEXT:    psubd %xmm4, %xmm3 -; SSE2-NEXT:    psubd %xmm4, %xmm0 -; SSE2-NEXT:    psubd %xmm4, %xmm2 -; SSE2-NEXT:    psubd %xmm4, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm0 -; SSE2-NEXT:    psrad $16, %xmm0 -; SSE2-NEXT:    packssdw %xmm3, %xmm0 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm2, %xmm1 +; SSE2-NEXT:    pavgw (%rsi), %xmm0 +; SSE2-NEXT:    pavgw 16(%rdi), %xmm1  ; SSE2-NEXT:    movdqu %xmm1, (%rax)  ; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v16i16_2:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpxor %xmm4, 
%xmm4, %xmm4 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpavgw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq @@ -1914,187 +1122,47 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {  define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {  ; SSE2-LABEL: avg_v32i16_2:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm4 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm11 -; SSE2-NEXT:    movdqa 32(%rdi), %xmm10 -; SSE2-NEXT:    movdqa 48(%rdi), %xmm8 -; SSE2-NEXT:    movdqa (%rsi), %xmm9 -; SSE2-NEXT:    movdqa 16(%rsi), %xmm1 -; SSE2-NEXT:    movdqa 32(%rsi), %xmm2 -; SSE2-NEXT:    movdqa 48(%rsi), %xmm3 -; SSE2-NEXT:    pxor %xmm0, %xmm0 -; SSE2-NEXT:    movdqa %xmm4, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm11, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm10, %xmm12 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm8, %xmm13 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm9, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm6, %xmm7 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm4, %xmm9 -; SSE2-NEXT:    movdqa %xmm1, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm5, %xmm6 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm11, %xmm1 -; SSE2-NEXT:    movdqa %xmm2, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm12, %xmm5 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm10, 
%xmm2 -; SSE2-NEXT:    movdqa %xmm3, %xmm4 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT:    paddd %xmm13, %xmm4 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT:    paddd %xmm8, %xmm3 -; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT:    psubd %xmm0, %xmm7 -; SSE2-NEXT:    psubd %xmm0, %xmm9 -; SSE2-NEXT:    psubd %xmm0, %xmm6 -; SSE2-NEXT:    psubd %xmm0, %xmm1 -; SSE2-NEXT:    psubd %xmm0, %xmm5 -; SSE2-NEXT:    psubd %xmm0, %xmm2 -; SSE2-NEXT:    psubd %xmm0, %xmm4 -; SSE2-NEXT:    psubd %xmm0, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm9 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm7 -; SSE2-NEXT:    psrad $16, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm9 -; SSE2-NEXT:    psrad $16, %xmm9 -; SSE2-NEXT:    packssdw %xmm7, %xmm9 -; SSE2-NEXT:    pslld $16, %xmm6 -; SSE2-NEXT:    psrad $16, %xmm6 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm6, %xmm1 -; SSE2-NEXT:    pslld $16, %xmm5 -; SSE2-NEXT:    psrad $16, %xmm5 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    packssdw %xmm5, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm4 -; SSE2-NEXT:    psrad $16, %xmm4 -; SSE2-NEXT:    pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    packssdw %xmm4, %xmm3 -; SSE2-NEXT:    movdqu %xmm3, (%rax) +; SSE2-NEXT:    movdqa (%rdi), %xmm0 +; SSE2-NEXT:    movdqa 16(%rdi), %xmm1 +; SSE2-NEXT:    movdqa 48(%rdi), %xmm2 +; SSE2-NEXT:    movdqa 32(%rsi), %xmm3 +; SSE2-NEXT:    pavgw (%rsi), %xmm0 +; SSE2-NEXT:    pavgw 16(%rsi), %xmm1 +; SSE2-NEXT:    pavgw 32(%rdi), %xmm3 +; SSE2-NEXT:    pavgw 48(%rsi), %xmm2  ; SSE2-NEXT:    movdqu %xmm2, (%rax) +; SSE2-NEXT:    movdqu %xmm3, (%rax)  ; SSE2-NEXT:    movdqu %xmm1, (%rax) -; SSE2-NEXT:    movdqu %xmm9, (%rax) +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v32i16_2:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm0, %xmm9 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm3, %xmm3 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm5, %xmm5 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm7 -; AVX1-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpsubd %xmm0, %xmm9, %xmm8 -; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubd %xmm0, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpsubd %xmm0, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpsubd %xmm0, %xmm6, %xmm6 -; AVX1-NEXT:    vpsubd %xmm0, %xmm7, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm9 -; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm6 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm5 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm7 -; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0],xmm0[1],xmm7[2],xmm0[3],xmm7[4],xmm0[5],xmm7[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] -; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2],xmm0[3],xmm9[4],xmm0[5],xmm9[6],xmm0[7] -; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT:    vmovups %ymm0, (%rax) +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT:    vmovdqa (%rsi), %ymm2 +; AVX1-NEXT:    vmovdqa 32(%rsi), %ymm3 +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT:    vpavgw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT:    vpavgw %xmm2, %xmm4, %xmm2 +; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1  ; AVX1-NEXT:    vmovups %ymm1, (%rax) +; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ;  ; AVX2-LABEL: avg_v32i16_2:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:   
 vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4 -; AVX2-NEXT:    vpsubd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpsubd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpsubd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT:    vpsubd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm1 -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm2 -; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-NEXT:    vmovdqa 32(%rsi), %ymm1 +; AVX2-NEXT:    vpavgw (%rsi), %ymm0, %ymm0 +; AVX2-NEXT:    vpavgw 32(%rdi), %ymm1, %ymm1  ; AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax)  ; AVX2-NEXT:    vzeroupper @@ -2102,19 +1170,12 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {  ;  ; AVX512F-LABEL: avg_v32i16_2:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT:    vpternlogd $255, 
%zmm2, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsubd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsubd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpmovdw %zmm0, (%rax) -; AVX512F-NEXT:    vpmovdw %zmm1, (%rax) +; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1 +; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT:    vpavgw 32(%rdi), %ymm1, %ymm1 +; AVX512F-NEXT:    vmovdqu %ymm1, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq  ; @@ -2209,89 +1270,21 @@ define void @avg_v16i8_const(<16 x i8>* %a) nounwind {  define void @avg_v32i8_const(<32 x i8>* %a) nounwind {  ; SSE2-LABEL: avg_v32i8_const:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm0 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm3 -; SSE2-NEXT:    pxor %xmm4, %xmm4 -; SSE2-NEXT:    movdqa %xmm3, %xmm1 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm7 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT:    movdqa %xmm3, %xmm6 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT:    movdqa %xmm0, %xmm2 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm2, %xmm5 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT:    movdqa %xmm0, %xmm8 -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [1,2,3,4] -; SSE2-NEXT:    paddd %xmm9, %xmm0 -; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,6,7,8] -; SSE2-NEXT:    paddd %xmm4, %xmm8 -; SSE2-NEXT:    paddd %xmm9, %xmm2 -; SSE2-NEXT:    paddd %xmm4, %xmm5 -; SSE2-NEXT:    paddd %xmm9, %xmm3 -; SSE2-NEXT:    paddd %xmm4, %xmm6 -; SSE2-NEXT:    paddd %xmm9, %xmm1 -; SSE2-NEXT:    paddd %xmm4, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    packuswb %xmm7, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    packuswb %xmm6, %xmm3 -; SSE2-NEXT:    packuswb %xmm3, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    packuswb %xmm5, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm8 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    packuswb %xmm8, %xmm0 -; 
SSE2-NEXT:    packuswb %xmm0, %xmm2 +; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; SSE2-NEXT:    movdqa (%rdi), %xmm1 +; SSE2-NEXT:    pavgb %xmm0, %xmm1 +; SSE2-NEXT:    pavgb 16(%rdi), %xmm0 +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    movdqu %xmm1, (%rax) -; SSE2-NEXT:    movdqu %xmm2, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v32i8_const:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,2,3,4] -; AVX1-NEXT:    vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,6,7,8] -; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm2 -; AVX1-NEXT:    vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm3 -; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX1-NEXT:    vpavgb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper @@ -2324,211 +1317,33 @@ define void @avg_v32i8_const(<32 x i8>* %a) nounwind {  define void @avg_v64i8_const(<64 x i8>* %a) nounwind {  ; SSE2-LABEL: avg_v64i8_const:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm5 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm6 -; SSE2-NEXT:    movdqa 32(%rdi), %xmm15 -; SSE2-NEXT:    movdqa 48(%rdi), %xmm11 -; SSE2-NEXT:    pxor %xmm0, %xmm0 -; SSE2-NEXT:    movdqa %xmm11, %xmm1 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm10 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = 
xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm9 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm11, %xmm1 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm15, %xmm14 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm14, %xmm13 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm15, %xmm12 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm6, %xmm3 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm3, %xmm8 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm6, %xmm4 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm5, %xmm2 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT:    movdqa %xmm2, %xmm1 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT:    movdqa %xmm5, %xmm7 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [5,6,7,8] -; SSE2-NEXT:    paddd %xmm0, %xmm5 -; SSE2-NEXT:    
paddd %xmm0, %xmm2 -; SSE2-NEXT:    paddd %xmm0, %xmm6 -; SSE2-NEXT:    paddd %xmm0, %xmm3 -; SSE2-NEXT:    paddd %xmm0, %xmm15 -; SSE2-NEXT:    paddd %xmm0, %xmm14 -; SSE2-NEXT:    paddd %xmm0, %xmm11 -; SSE2-NEXT:    paddd %xmm0, %xmm9 -; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,2,3,4] -; SSE2-NEXT:    paddd %xmm0, %xmm7 -; SSE2-NEXT:    paddd %xmm0, %xmm1 -; SSE2-NEXT:    paddd %xmm0, %xmm4 -; SSE2-NEXT:    paddd %xmm0, %xmm8 -; SSE2-NEXT:    paddd %xmm0, %xmm12 -; SSE2-NEXT:    paddd %xmm0, %xmm13 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload -; SSE2-NEXT:    paddd %xmm0, %xmm9 -; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT:    paddd %xmm0, %xmm10 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT:    pand %xmm0, %xmm5 -; SSE2-NEXT:    pand %xmm0, %xmm7 -; SSE2-NEXT:    packuswb %xmm5, %xmm7 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    pand %xmm0, %xmm2 -; SSE2-NEXT:    pand %xmm0, %xmm1 -; SSE2-NEXT:    packuswb %xmm2, %xmm1 -; SSE2-NEXT:    packuswb %xmm7, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    pand %xmm0, %xmm6 -; SSE2-NEXT:    pand %xmm0, %xmm4 -; SSE2-NEXT:    packuswb %xmm6, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm8 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    pand %xmm0, %xmm3 -; SSE2-NEXT:    pand %xmm0, %xmm8 -; SSE2-NEXT:    packuswb %xmm3, %xmm8 -; SSE2-NEXT:    packuswb %xmm4, %xmm8 -; SSE2-NEXT:    psrld $1, %xmm12 -; SSE2-NEXT:    psrld $1, %xmm15 -; SSE2-NEXT:    pand %xmm0, %xmm15 -; SSE2-NEXT:    pand %xmm0, %xmm12 -; SSE2-NEXT:    packuswb %xmm15, %xmm12 -; SSE2-NEXT:    psrld $1, %xmm13 -; SSE2-NEXT:    psrld $1, %xmm14 -; SSE2-NEXT:    pand %xmm0, %xmm14 -; SSE2-NEXT:    pand %xmm0, %xmm13 -; SSE2-NEXT:    packuswb %xmm14, %xmm13 -; SSE2-NEXT:    packuswb %xmm12, %xmm13 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm11 -; SSE2-NEXT:    pand %xmm0, %xmm11 -; SSE2-NEXT:    pand %xmm0, %xmm2 -; SSE2-NEXT:    packuswb %xmm11, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm10 -; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    pand %xmm0, %xmm3 -; SSE2-NEXT:    pand %xmm0, %xmm10 -; SSE2-NEXT:    packuswb %xmm3, %xmm10 -; SSE2-NEXT:    packuswb %xmm2, %xmm10 -; SSE2-NEXT:    movdqu %xmm10, (%rax) -; SSE2-NEXT:    movdqu %xmm13, (%rax) -; SSE2-NEXT:    movdqu %xmm8, (%rax) +; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; SSE2-NEXT:    movdqa (%rdi), %xmm1 +; SSE2-NEXT:    pavgb %xmm0, %xmm1 +; SSE2-NEXT:    movdqa 16(%rdi), %xmm2 +; SSE2-NEXT:    pavgb %xmm0, %xmm2 +; SSE2-NEXT:    movdqa 32(%rdi), %xmm3 +; SSE2-NEXT:    pavgb %xmm0, %xmm3 +; SSE2-NEXT:    pavgb 48(%rdi), %xmm0 +; SSE2-NEXT:    movdqu %xmm0, (%rax) +; SSE2-NEXT:    movdqu %xmm3, (%rax) +; SSE2-NEXT:    movdqu %xmm2, (%rax)  ; SSE2-NEXT:    movdqu %xmm1, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v64i8_const:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vmovdqa %xmm0, 
-{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [5,6,7,8] -; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpaddd %xmm0, %xmm7, %xmm15 -; AVX1-NEXT:    vpaddd %xmm0, %xmm13, %xmm13 -; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm7 -; AVX1-NEXT:    vpaddd %xmm0, %xmm11, %xmm11 -; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm1 -; AVX1-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vpaddd %xmm0, %xmm9, %xmm9 -; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,2,3,4] -; AVX1-NEXT:    vpaddd %xmm2, %xmm12, %xmm0 -; AVX1-NEXT:    vpaddd %xmm2, %xmm10, %xmm10 -; AVX1-NEXT:    vpaddd %xmm2, %xmm8, %xmm8 -; AVX1-NEXT:    vpaddd %xmm2, %xmm4, %xmm4 -; AVX1-NEXT:    vpaddd %xmm2, %xmm6, %xmm1 -; AVX1-NEXT:    vpaddd %xmm2, %xmm14, %xmm6 -; AVX1-NEXT:    vpaddd -{{[0-9]+}}(%rsp), %xmm2, %xmm12 # 16-byte Folded Reload -; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT:    vpaddd %xmm2, %xmm3, %xmm14 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm3 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm15, %xmm2 -; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm13, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; 
AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm7, %xmm4 -; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm2 -; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm2 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm14, %xmm3 -; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4 -; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT:    vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vpavgb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vpavgb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1  ; AVX1-NEXT:    vmovups %ymm1, (%rax)  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper @@ -2536,82 +1351,21 @@ define void @avg_v64i8_const(<64 x i8>* %a) nounwind {  ;  ; AVX2-LABEL: avg_v64i8_const:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm8 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT:    vpaddd %ymm8, %ymm7, %ymm7 -; AVX2-NEXT:    vpaddd %ymm8, %ymm6, %ymm6 -; AVX2-NEXT:    vpaddd %ymm8, %ymm5, %ymm5 -; AVX2-NEXT:    vpaddd %ymm8, %ymm4, %ymm4 -; AVX2-NEXT:    vpaddd %ymm8, %ymm3, %ymm3 -; AVX2-NEXT:    vpaddd %ymm8, %ymm2, %ymm2 -; AVX2-NEXT:    vpaddd %ymm8, %ymm1, %ymm1 -; AVX2-NEXT:    vpaddd %ymm8, %ymm0, %ymm0 -; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm8 -; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT:    vpsrld $1, %ymm6, %ymm6 -; AVX2-NEXT:    vpsrld $1, %ymm7, %ymm7 -; AVX2-NEXT:    vextracti128 $1, %ymm7, %xmm0 -; AVX2-NEXT:    vpackssdw %xmm0, %xmm7, %xmm0 -; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7 -; AVX2-NEXT:    vpackssdw %xmm7, %xmm6, %xmm6 -; AVX2-NEXT:    vpackuswb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6 -; AVX2-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5 -; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6 -; AVX2-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4 -; AVX2-NEXT:    vpackuswb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT:    vpackssdw %xmm4, %xmm2, %xmm2 -; AVX2-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vextracti128 $1, %ymm8, %xmm3 -; AVX2-NEXT:    vpackssdw %xmm3, %xmm8, %xmm3 -; AVX2-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1 -; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-NEXT:    vmovdqu %ymm1, (%rax) +; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX2-NEXT:    vpavgb (%rdi), %ymm0, %ymm1 +; AVX2-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax) +; AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ;  ; AVX512F-LABEL: avg_v64i8_const:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] -; AVX512F-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT:    vpaddd %zmm4, %zmm3, %zmm3 -; AVX512F-NEXT:    vpaddd %zmm4, %zmm2, %zmm2 -; AVX512F-NEXT:    vpaddd %zmm4, %zmm1, %zmm1 -; AVX512F-NEXT:    vpaddd %zmm4, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpsrld $1, %zmm2, %zmm2 -; AVX512F-NEXT:    vpsrld $1, %zmm3, %zmm3 -; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3 -; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2 -; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] +; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm1 +; AVX512F-NEXT:    vpavgb 32(%rdi), %ymm0, %ymm0  ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax) -; AVX512F-NEXT:    vmovdqu %ymm2, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq  ; @@ -2680,57 +1434,21 @@ define void @avg_v8i16_const(<8 x i16>* %a) nounwind {  define void @avg_v16i16_const(<16 x i16>* %a) nounwind {  ; SSE2-LABEL: avg_v16i16_const:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm3 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm0 -; SSE2-NEXT:    pxor %xmm4, %xmm4 -; SSE2-NEXT:    movdqa %xmm0, %xmm1 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT:    movdqa %xmm3, %xmm2 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [5,6,7,8] -; SSE2-NEXT:    paddd %xmm4, %xmm3 -; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,3,4] -; SSE2-NEXT:    paddd %xmm5, %xmm2 -; SSE2-NEXT:    paddd %xmm4, %xmm0 -; SSE2-NEXT:    paddd %xmm5, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    
pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    packssdw %xmm3, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm0 -; SSE2-NEXT:    psrad $16, %xmm0 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm0, %xmm1 +; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE2-NEXT:    movdqa (%rdi), %xmm1 +; SSE2-NEXT:    pavgw %xmm0, %xmm1 +; SSE2-NEXT:    pavgw 16(%rdi), %xmm0 +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    movdqu %xmm1, (%rax) -; SSE2-NEXT:    movdqu %xmm2, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v16i16_const:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,2,3,4] -; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [5,6,7,8] -; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpaddd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT:    vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT:    vpavgw %xmm2, %xmm0, %xmm0  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    vmovups %ymm0, (%rax)  ; AVX1-NEXT:    vzeroupper @@ -2763,148 +1481,57 @@ define void @avg_v16i16_const(<16 x i16>* %a) nounwind {  define void @avg_v32i16_const(<32 x i16>* %a) nounwind {  ; SSE2-LABEL: avg_v32i16_const:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movdqa (%rdi), %xmm7 -; SSE2-NEXT:    movdqa 16(%rdi), %xmm6 -; SSE2-NEXT:    movdqa 32(%rdi), %xmm4 -; SSE2-NEXT:    movdqa 48(%rdi), %xmm0 -; SSE2-NEXT:    pxor %xmm8, %xmm8 -; SSE2-NEXT:    movdqa %xmm0, %xmm1 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; SSE2-NEXT:    movdqa %xmm4, %xmm2 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; SSE2-NEXT:    movdqa %xmm6, %xmm3 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; SSE2-NEXT:    movdqa %xmm7, %xmm5 -; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [5,6,7,8] -; SSE2-NEXT:    paddd %xmm8, %xmm7 -; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [1,2,3,4] -; SSE2-NEXT:    paddd %xmm9, %xmm5 -; SSE2-NEXT:    paddd %xmm8, %xmm6 
-; SSE2-NEXT:    paddd %xmm9, %xmm3 -; SSE2-NEXT:    paddd %xmm8, %xmm4 -; SSE2-NEXT:    paddd %xmm9, %xmm2 -; SSE2-NEXT:    paddd %xmm8, %xmm0 -; SSE2-NEXT:    paddd %xmm9, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm1 -; SSE2-NEXT:    psrld $1, %xmm0 -; SSE2-NEXT:    psrld $1, %xmm2 -; SSE2-NEXT:    psrld $1, %xmm4 -; SSE2-NEXT:    psrld $1, %xmm3 -; SSE2-NEXT:    psrld $1, %xmm6 -; SSE2-NEXT:    psrld $1, %xmm5 -; SSE2-NEXT:    psrld $1, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm7 -; SSE2-NEXT:    psrad $16, %xmm7 -; SSE2-NEXT:    pslld $16, %xmm5 -; SSE2-NEXT:    psrad $16, %xmm5 -; SSE2-NEXT:    packssdw %xmm7, %xmm5 -; SSE2-NEXT:    pslld $16, %xmm6 -; SSE2-NEXT:    psrad $16, %xmm6 -; SSE2-NEXT:    pslld $16, %xmm3 -; SSE2-NEXT:    psrad $16, %xmm3 -; SSE2-NEXT:    packssdw %xmm6, %xmm3 -; SSE2-NEXT:    pslld $16, %xmm4 -; SSE2-NEXT:    psrad $16, %xmm4 -; SSE2-NEXT:    pslld $16, %xmm2 -; SSE2-NEXT:    psrad $16, %xmm2 -; SSE2-NEXT:    packssdw %xmm4, %xmm2 -; SSE2-NEXT:    pslld $16, %xmm0 -; SSE2-NEXT:    psrad $16, %xmm0 -; SSE2-NEXT:    pslld $16, %xmm1 -; SSE2-NEXT:    psrad $16, %xmm1 -; SSE2-NEXT:    packssdw %xmm0, %xmm1 -; SSE2-NEXT:    movdqu %xmm1, (%rax) -; SSE2-NEXT:    movdqu %xmm2, (%rax) +; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] +; SSE2-NEXT:    movdqa (%rdi), %xmm1 +; SSE2-NEXT:    pavgw %xmm0, %xmm1 +; SSE2-NEXT:    movdqa 16(%rdi), %xmm2 +; SSE2-NEXT:    pavgw %xmm0, %xmm2 +; SSE2-NEXT:    movdqa 32(%rdi), %xmm3 +; SSE2-NEXT:    pavgw %xmm0, %xmm3 +; SSE2-NEXT:    pavgw 48(%rdi), %xmm0 +; SSE2-NEXT:    movdqu %xmm0, (%rax)  ; SSE2-NEXT:    movdqu %xmm3, (%rax) -; SSE2-NEXT:    movdqu %xmm5, (%rax) +; SSE2-NEXT:    movdqu %xmm2, (%rax) +; SSE2-NEXT:    movdqu %xmm1, (%rax)  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v32i16_const:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm0 = [1,2,3,4] -; AVX1-NEXT:    vpaddd %xmm0, %xmm7, %xmm9 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [5,6,7,8] -; AVX1-NEXT:    vpaddd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT:    vpaddd %xmm0, %xmm5, %xmm5 -; AVX1-NEXT:    vpaddd %xmm7, %xmm4, %xmm4 -; AVX1-NEXT:    vpaddd %xmm0, %xmm3, %xmm3 -; AVX1-NEXT:    vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT:    vpaddd %xmm7, %xmm8, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm2 -; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm3 -; AVX1-NEXT:    vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm3 -; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm4 -; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT:    vinsertf128 $1, %xmm3, 
%ymm2, %ymm2 -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] +; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vpavgw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vpavgw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT:    vpavgw %xmm3, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-NEXT:    vmovups %ymm1, (%rax)  ; AVX1-NEXT:    vmovups %ymm0, (%rax) -; AVX1-NEXT:    vmovups %ymm2, (%rax)  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ;  ; AVX2-LABEL: avg_v32i16_const:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,2,3,4,5,6,7,8] -; AVX2-NEXT:    vpaddd %ymm4, %ymm3, %ymm3 -; AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT:    vpaddd %ymm4, %ymm1, %ymm1 -; AVX2-NEXT:    vpaddd %ymm4, %ymm0, %ymm0 -; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0 -; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4 -; AVX2-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2 -; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT:    # ymm0 = mem[0,1,0,1] +; AVX2-NEXT:    vpavgw (%rdi), %ymm0, %ymm1 +; AVX2-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0  ; AVX2-NEXT:    vmovdqu %ymm0, (%rax) -; AVX2-NEXT:    vmovdqu %ymm2, (%rax) +; AVX2-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ;  ; AVX512F-LABEL: avg_v32i16_const:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8] -; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512F-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0 -; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1 -; AVX512F-NEXT:    vpmovdw %zmm1, (%rax) -; 
AVX512F-NEXT:    vpmovdw %zmm0, (%rax) +; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-NEXT:    # ymm0 = mem[0,1,0,1] +; AVX512F-NEXT:    vpavgw (%rdi), %ymm0, %ymm1 +; AVX512F-NEXT:    vpavgw 32(%rdi), %ymm0, %ymm0 +; AVX512F-NEXT:    vmovdqu %ymm0, (%rax) +; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)  ; AVX512F-NEXT:    vzeroupper  ; AVX512F-NEXT:    retq  ; @@ -2946,77 +1573,16 @@ define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {  define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {  ; SSE2-LABEL: avg_v32i8_3:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    pxor %xmm5, %xmm5 -; SSE2-NEXT:    movdqa %xmm0, %xmm6 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm7 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE2-NEXT:    movdqa %xmm2, %xmm4 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE2-NEXT:    paddw %xmm6, %xmm4 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE2-NEXT:    paddw %xmm2, %xmm0 -; SSE2-NEXT:    movdqa %xmm3, %xmm2 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE2-NEXT:    paddw %xmm7, %xmm2 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT:    paddw %xmm3, %xmm1 -; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT:    psubw %xmm3, %xmm4 -; SSE2-NEXT:    psubw %xmm3, %xmm0 -; SSE2-NEXT:    psubw %xmm3, %xmm2 -; SSE2-NEXT:    psubw %xmm3, %xmm1 -; SSE2-NEXT:    psrlw $1, %xmm1 -; SSE2-NEXT:    psrlw $1, %xmm2 -; SSE2-NEXT:    psrlw $1, %xmm0 -; SSE2-NEXT:    psrlw $1, %xmm4 -; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT:    pand %xmm3, %xmm4 -; SSE2-NEXT:    pand %xmm3, %xmm0 -; SSE2-NEXT:    packuswb %xmm4, %xmm0 -; SSE2-NEXT:    pand %xmm3, %xmm2 -; SSE2-NEXT:    pand %xmm3, %xmm1 -; SSE2-NEXT:    packuswb %xmm2, %xmm1 +; SSE2-NEXT:    pavgb %xmm2, %xmm0 +; SSE2-NEXT:    pavgb %xmm3, %xmm1  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v32i8_3:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpmovzxbw {{.*#+}} 
xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT:    vpaddw %xmm6, %xmm3, %xmm3 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT:    vpaddw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm3 -; AVX1-NEXT:    vpsubw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubw %xmm1, %xmm4, %xmm4 -; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm1 -; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm3 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT:    vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT:    vpavgb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT:    vpavgb %xmm1, %xmm0, %xmm0  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; @@ -3041,203 +1607,36 @@ define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {  define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {  ; SSE2-LABEL: avg_v64i8_3:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    pxor %xmm9, %xmm9 -; SSE2-NEXT:    movdqa %xmm0, %xmm10 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] -; SSE2-NEXT:    movdqa %xmm1, %xmm11 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE2-NEXT:    movdqa %xmm2, %xmm12 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; 
SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE2-NEXT:    movdqa %xmm3, %xmm13 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT:    movdqa %xmm4, %xmm8 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] -; SSE2-NEXT:    paddw %xmm10, %xmm8 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] -; SSE2-NEXT:    paddw %xmm4, %xmm0 -; SSE2-NEXT:    movdqa %xmm5, %xmm4 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] -; SSE2-NEXT:    paddw %xmm11, %xmm4 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT:    paddw %xmm5, %xmm1 -; SSE2-NEXT:    movdqa %xmm6, %xmm5 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE2-NEXT:    paddw %xmm12, %xmm5 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE2-NEXT:    paddw %xmm6, %xmm2 -; SSE2-NEXT:    movdqa %xmm7, %xmm6 -; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] -; SSE2-NEXT:    paddw %xmm13, %xmm6 -; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; SSE2-NEXT:    paddw %xmm7, %xmm3 -; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT:    psubw %xmm7, %xmm8 -; SSE2-NEXT:    psubw %xmm7, %xmm0 -; SSE2-NEXT:    psubw %xmm7, %xmm4 -; SSE2-NEXT:    psubw %xmm7, %xmm1 -; SSE2-NEXT:    psubw %xmm7, %xmm5 -; SSE2-NEXT:    psubw %xmm7, %xmm2 -; SSE2-NEXT:    psubw %xmm7, %xmm6 -; SSE2-NEXT:    psubw %xmm7, %xmm3 -; SSE2-NEXT:    psrlw $1, %xmm3 -; SSE2-NEXT:    psrlw $1, %xmm6 -; SSE2-NEXT:    psrlw $1, %xmm2 -; SSE2-NEXT:    psrlw $1, %xmm5 -; SSE2-NEXT:    psrlw $1, %xmm1 -; SSE2-NEXT:    psrlw $1, %xmm4 -; SSE2-NEXT:    psrlw $1, %xmm0 -; SSE2-NEXT:    psrlw $1, %xmm8 -; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT:    pand %xmm7, %xmm8 -; SSE2-NEXT:    pand %xmm7, %xmm0 -; SSE2-NEXT:    packuswb %xmm8, %xmm0 -; SSE2-NEXT:    pand %xmm7, %xmm4 -; SSE2-NEXT:    pand %xmm7, %xmm1 -; SSE2-NEXT:    packuswb %xmm4, %xmm1 -; SSE2-NEXT:    pand %xmm7, %xmm5 -; SSE2-NEXT:    pand %xmm7, %xmm2 -; SSE2-NEXT:    packuswb %xmm5, %xmm2 -; SSE2-NEXT:    pand %xmm7, %xmm6 -; SSE2-NEXT:    pand %xmm7, %xmm3 -; SSE2-NEXT:    packuswb %xmm6, %xmm3 +; SSE2-NEXT:    pavgb %xmm4, 
%xmm0 +; SSE2-NEXT:    pavgb %xmm5, %xmm1 +; SSE2-NEXT:    pavgb %xmm6, %xmm2 +; SSE2-NEXT:    pavgb %xmm7, %xmm3  ; SSE2-NEXT:    retq  ;  ; AVX1-LABEL: avg_v64i8_3:  ; AVX1:       # %bb.0: -; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vpaddw %xmm7, %xmm5, %xmm12 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT:    vpaddw %xmm1, %xmm4, %xmm13 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpaddw %xmm4, %xmm6, %xmm14 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpaddw %xmm2, %xmm0, %xmm15 -; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpaddw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT:    vpaddw %xmm2, %xmm11, %xmm2 -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT:    vpaddw %xmm7, %xmm9, %xmm7 -; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT:    vpaddw %xmm3, %xmm10, %xmm3 -; AVX1-NEXT:    
vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX1-NEXT:    vpsubw %xmm5, %xmm12, %xmm8 -; AVX1-NEXT:    vpsubw %xmm5, %xmm13, %xmm4 -; AVX1-NEXT:    vpsubw %xmm5, %xmm14, %xmm0 -; AVX1-NEXT:    vpsubw %xmm5, %xmm15, %xmm1 -; AVX1-NEXT:    vpsubw %xmm5, %xmm6, %xmm6 -; AVX1-NEXT:    vpsubw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT:    vpsubw %xmm5, %xmm7, %xmm7 -; AVX1-NEXT:    vpsubw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm9 -; AVX1-NEXT:    vpsrlw $1, %xmm7, %xmm5 -; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm2 -; AVX1-NEXT:    vpsrlw $1, %xmm6, %xmm6 -; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1 -; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm0 -; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm4 -; AVX1-NEXT:    vpsrlw $1, %xmm8, %xmm7 -; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX1-NEXT:    vpshufb %xmm3, %xmm7, %xmm7 -; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0] -; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT:    vpavgb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT:    vpavgb %xmm2, %xmm0, %xmm0  ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm1 -; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT:    vpshufb %xmm3, %xmm5, %xmm2 -; AVX1-NEXT:    vpshufb %xmm3, %xmm9, %xmm3 -; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT:    vpavgb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT:    vpavgb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1  ; AVX1-NEXT:    retq  ;  ; AVX2-LABEL: avg_v64i8_3:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm5 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm6 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero -; AVX2-NEXT:    vpaddw %ymm6, %ymm4, %ymm4 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm2 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero -; AVX2-NEXT:    vpaddw %ymm2, %ymm5, %ymm2 -; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero -; AVX2-NEXT:    vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT:    vpsubw %ymm3, %ymm4, %ymm4 -; AVX2-NEXT:    vpsubw %ymm3, %ymm0, %ymm0 -; AVX2-NEXT:    vpsubw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT:    vpsubw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrlw $1, %ymm1, %ymm1 -; AVX2-NEXT:    vpsrlw $1, %ymm2, %ymm2 -; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm0 -; AVX2-NEXT:    vpsrlw $1, %ymm4, %ymm3 -; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT:    vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT:    vpshufb %xmm5, %xmm4, %xmm4 -; AVX2-NEXT:    vpshufb %xmm5, %xmm0, %xmm0 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT:    vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3 -; AVX2-NEXT:    vpshufb %xmm5, %xmm3, %xmm3 -; AVX2-NEXT:    vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT:    vpavgb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT:    vpavgb %ymm3, %ymm1, %ymm1  ; AVX2-NEXT:    retq  ;  ; AVX512F-LABEL: avg_v64i8_3:  ; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm4 -; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm5 -; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm6 -; AVX512F-NEXT:    vpavgb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm6 -; AVX512F-NEXT:    vpavgb %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT:    vpavgb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0 -; AVX512F-NEXT:    vpavgb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm1, %ymm1 +; AVX512F-NEXT:    vpavgb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT:    vpavgb %ymm3, %ymm1, %ymm1  ; AVX512F-NEXT:    retq  ;  ; AVX512BW-LABEL: avg_v64i8_3: diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 6e6d61f37d2e..248462d0de51 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -17,78 +17,40 @@ define <16 x i1> @test1() {  }  define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { -; KNL-LABEL: test2: -; KNL:       ## %bb.0: -; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1 -; KNL-NEXT:    vpslld $31, %zmm1, %zmm1 -; 
KNL-NEXT:    vpmovsxbd %xmm0, %zmm0 -; KNL-NEXT:    vpslld $31, %zmm0, %zmm0 -; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT:    vpmovdb %zmm0, %xmm0 -; KNL-NEXT:    retq -; -; SKX-LABEL: test2: -; SKX:       ## %bb.0: -; SKX-NEXT:    vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT:    vpmovb2m %xmm1, %k0 -; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT:    vpmovb2m %xmm0, %k1 -; SKX-NEXT:    kandw %k0, %k1, %k0 -; SKX-NEXT:    vpmovm2b %k0, %xmm0 -; SKX-NEXT:    retq +; ALL_X64-LABEL: test2: +; ALL_X64:       ## %bb.0: +; ALL_X64-NEXT:    vpand %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT:    vpsllw $7, %xmm0, %xmm0 +; ALL_X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0 +; ALL_X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; ALL_X64-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0 +; ALL_X64-NEXT:    retq  ;  ; KNL_X32-LABEL: test2:  ; KNL_X32:       ## %bb.0: -; KNL_X32-NEXT:    vpmovsxbd %xmm1, %zmm1 -; KNL_X32-NEXT:    vpslld $31, %zmm1, %zmm1 -; KNL_X32-NEXT:    vpmovsxbd %xmm0, %zmm0 -; KNL_X32-NEXT:    vpslld $31, %zmm0, %zmm0 -; KNL_X32-NEXT:    vptestmd %zmm0, %zmm0, %k1 -; KNL_X32-NEXT:    vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT:    vpmovdb %zmm0, %xmm0 +; KNL_X32-NEXT:    vpand %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT:    vpsllw $7, %xmm0, %xmm0 +; KNL_X32-NEXT:    vpand LCPI1_0, %xmm0, %xmm0 +; KNL_X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; KNL_X32-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0  ; KNL_X32-NEXT:    retl    %c = and <16 x i1>%a, %b    ret <16 x i1> %c  }  define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { -; KNL-LABEL: test3: -; KNL:       ## %bb.0: -; KNL-NEXT:    vpmovsxwq %xmm1, %zmm1 -; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1 -; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0 -; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0 -; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1 -; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT:    vpmovdw %zmm0, %ymm0 -; KNL-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %ymm0 -; KNL-NEXT:    retq -; -; SKX-LABEL: test3: -; SKX:       ## %bb.0: -; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1 -; SKX-NEXT:    vpmovw2m %xmm1, %k0 -; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT:    vpmovw2m %xmm0, %k1 -; SKX-NEXT:    kandb %k0, %k1, %k0 -; SKX-NEXT:    vpmovm2w %k0, %xmm0 -; SKX-NEXT:    retq +; ALL_X64-LABEL: test3: +; ALL_X64:       ## %bb.0: +; ALL_X64-NEXT:    vpand %xmm1, %xmm0, %xmm0 +; ALL_X64-NEXT:    vpsllw $15, %xmm0, %xmm0 +; ALL_X64-NEXT:    vpsraw $15, %xmm0, %xmm0 +; ALL_X64-NEXT:    retq  ;  ; KNL_X32-LABEL: test3:  ; KNL_X32:       ## %bb.0: -; KNL_X32-NEXT:    vpmovsxwq %xmm1, %zmm1 -; KNL_X32-NEXT:    vpsllq $63, %zmm1, %zmm1 -; KNL_X32-NEXT:    vpmovsxwq %xmm0, %zmm0 -; KNL_X32-NEXT:    vpsllq $63, %zmm0, %zmm0 -; KNL_X32-NEXT:    vptestmq %zmm0, %zmm0, %k1 -; KNL_X32-NEXT:    vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL_X32-NEXT:    vpmovdw %zmm0, %ymm0 -; KNL_X32-NEXT:    ## kill: def %xmm0 killed %xmm0 killed %ymm0 +; KNL_X32-NEXT:    vpand %xmm1, %xmm0, %xmm0 +; KNL_X32-NEXT:    vpsllw $15, %xmm0, %xmm0 +; KNL_X32-NEXT:    vpsraw $15, %xmm0, %xmm0  ; KNL_X32-NEXT:    retl    %c = and <8 x i1>%a, %b    ret <8 x i1> %c @@ -102,11 +64,9 @@ define <4 x i1> @test4(<4 x i1>%a, <4 x i1>%b) {  ;  ; SKX-LABEL: test4:  ; SKX:       ## %bb.0: -; SKX-NEXT:    vpslld $31, %xmm1, 
%xmm1 +; SKX-NEXT:    vpand %xmm1, %xmm0, %xmm0  ; SKX-NEXT:    vpslld $31, %xmm0, %xmm0 -; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k0 {%k1} -; SKX-NEXT:    vpmovm2d %k0, %xmm0 +; SKX-NEXT:    vpsrad $31, %xmm0, %xmm0  ; SKX-NEXT:    retq  ;  ; KNL_X32-LABEL: test4: diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 97beff63811a..8c7941591217 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1366,21 +1366,12 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {  }  define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { -; KNL-LABEL: trunc_4i32_to_4i1: -; KNL:       # %bb.0: -; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0 -; KNL-NEXT:    vpslld $31, %xmm0, %xmm0 -; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0 -; KNL-NEXT:    retq -; -; SKX-LABEL: trunc_4i32_to_4i1: -; SKX:       # %bb.0: -; SKX-NEXT:    vpslld $31, %xmm0, %xmm0 -; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT:    vpslld $31, %xmm1, %xmm0 -; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0 {%k1} -; SKX-NEXT:    vpmovm2d %k0, %xmm0 -; SKX-NEXT:    retq +; ALL-LABEL: trunc_4i32_to_4i1: +; ALL:       # %bb.0: +; ALL-NEXT:    vpand %xmm1, %xmm0, %xmm0 +; ALL-NEXT:    vpslld $31, %xmm0, %xmm0 +; ALL-NEXT:    vpsrad $31, %xmm0, %xmm0 +; ALL-NEXT:    retq    %mask_a = trunc <4 x i32>%a to <4 x i1>    %mask_b = trunc <4 x i32>%b to <4 x i1>    %a_and_b = and <4 x i1>%mask_a, %mask_b diff --git a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll index 34ea468aebee..e1ed8ea98a1c 100644 --- a/test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ b/test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -249,9 +249,9 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovd (%rdi), %k0  ; AVX512-NEXT:    kshiftrd $16, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm2 -; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2 -; AVX512-NEXT:    vpmovq2m %zmm2, %k1 +; AVX512-NEXT:    vpmovm2d %k0, %ymm2 +; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2 +; AVX512-NEXT:    vpmovd2m %ymm2, %k1  ; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512-NEXT:    vzeroupper @@ -261,10 +261,11 @@ define void @load_v32i1_broadcast_16_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2 +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1  ; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -340,10 +341,10 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovd (%rdi), %k0  ; AVX512-NEXT:    kshiftrd $24, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm2 -; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2 -; AVX512-NEXT:    vpmovq2m %zmm2, 
%k1 +; AVX512-NEXT:    vpmovm2d %k0, %ymm2 +; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT:    vpmovd2m %ymm2, %k1  ; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512-NEXT:    vzeroupper @@ -353,11 +354,12 @@ define void @load_v32i1_broadcast_31_v8i1(<32 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrd $24, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT:    vpermq %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1  ; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -433,9 +435,9 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovq (%rdi), %k0  ; AVX512-NEXT:    kshiftrq $32, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm2 -; AVX512-NEXT:    vpbroadcastq %xmm2, %zmm2 -; AVX512-NEXT:    vpmovq2m %zmm2, %k1 +; AVX512-NEXT:    vpmovm2d %k0, %ymm2 +; AVX512-NEXT:    vpbroadcastd %xmm2, %ymm2 +; AVX512-NEXT:    vpmovd2m %ymm2, %k1  ; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512-NEXT:    vzeroupper @@ -445,10 +447,11 @@ define void @load_v64i1_broadcast_32_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm2, %ymm2 +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1  ; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -555,10 +558,10 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovq (%rdi), %k0  ; AVX512-NEXT:    kshiftrq $56, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm2 -; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT:    vpermq %zmm2, %zmm3, %zmm2 -; AVX512-NEXT:    vpmovq2m %zmm2, %k1 +; AVX512-NEXT:    vpmovm2d %k0, %ymm2 +; AVX512-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT:    vpmovd2m %ymm2, %k1  ; AVX512-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512-NEXT:    vzeroupper @@ -568,11 +571,12 @@ define void @load_v64i1_broadcast_63_v8i1(<64 x i1>* %a0,<8 x float> %a1,<8 x fl  ; AVX512NOTDQ:       # 
%bb.0:  ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrq $56, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT:    vpermq %zmm2, %zmm3, %zmm2 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm2, %zmm2 -; AVX512NOTDQ-NEXT:    vptestmq %zmm2, %zmm2, %k1 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm2, %ymm2 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm2, %ymm2 +; AVX512NOTDQ-NEXT:    vptestmd %ymm2, %ymm2, %k1  ; AVX512NOTDQ-NEXT:    vmovaps %ymm0, %ymm1 {%k1}  ; AVX512NOTDQ-NEXT:    vmovaps %ymm1, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -1054,9 +1058,9 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovd (%rdi), %k0  ; AVX512-NEXT:    kshiftrd $16, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm0 -; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT:    vpmovq2m %zmm0, %k0 +; AVX512-NEXT:    vpmovm2d %k0, %ymm0 +; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0 +; AVX512-NEXT:    vpmovd2m %ymm0, %k0  ; AVX512-NEXT:    kmovb %k0, (%rsi)  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq @@ -1065,10 +1069,11 @@ define void @load_v32i1_broadcast_16_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrd $16, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0 +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0  ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax  ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -1159,10 +1164,10 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovd (%rdi), %k0  ; AVX512-NEXT:    kshiftrd $24, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm0 -; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT:    vpmovq2m %zmm0, %k0 +; AVX512-NEXT:    vpmovm2d %k0, %ymm0 +; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT:    vpmovd2m %ymm0, %k0  ; AVX512-NEXT:    kmovb %k0, (%rsi)  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq @@ -1171,11 +1176,12 @@ define void @load_v32i1_broadcast_31_v8i1_store(<32 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovd (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrd $24, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; 
AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0  ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax  ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -1266,9 +1272,9 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovq (%rdi), %k0  ; AVX512-NEXT:    kshiftrq $32, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm0 -; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0 -; AVX512-NEXT:    vpmovq2m %zmm0, %k0 +; AVX512-NEXT:    vpmovm2d %k0, %ymm0 +; AVX512-NEXT:    vpbroadcastd %xmm0, %ymm0 +; AVX512-NEXT:    vpmovd2m %ymm0, %k0  ; AVX512-NEXT:    kmovb %k0, (%rsi)  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq @@ -1277,10 +1283,11 @@ define void @load_v64i1_broadcast_32_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrq $32, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq %xmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpbroadcastd %xmm0, %ymm0 +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0  ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax  ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper @@ -1399,10 +1406,10 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    kmovq (%rdi), %k0  ; AVX512-NEXT:    kshiftrq $56, %k0, %k0 -; AVX512-NEXT:    vpmovm2q %k0, %zmm0 -; AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT:    vpmovq2m %zmm0, %k0 +; AVX512-NEXT:    vpmovm2d %k0, %ymm0 +; AVX512-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT:    vpmovd2m %ymm0, %k0  ; AVX512-NEXT:    kmovb %k0, (%rsi)  ; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq @@ -1411,11 +1418,12 @@ define void @load_v64i1_broadcast_63_v8i1_store(<64 x i1>* %a0,<8 x i1>* %a1) {  ; AVX512NOTDQ:       # %bb.0:  ; AVX512NOTDQ-NEXT:    kmovq (%rdi), %k0  ; AVX512NOTDQ-NEXT:    kshiftrq $56, %k0, %k1 -; AVX512NOTDQ-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512NOTDQ-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7] -; AVX512NOTDQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0 -; AVX512NOTDQ-NEXT:    vpsllq $63, %zmm0, %zmm0 -; AVX512NOTDQ-NEXT:    vptestmq %zmm0, %zmm0, %k0 +; AVX512NOTDQ-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512NOTDQ-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] +; AVX512NOTDQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT:    vpslld $31, %ymm0, %ymm0 +; AVX512NOTDQ-NEXT:    vptestmd %ymm0, %ymm0, %k0  ; AVX512NOTDQ-NEXT:    kmovd %k0, %eax  ; AVX512NOTDQ-NEXT:    movb %al, (%rsi)  ; AVX512NOTDQ-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 7e0b981b2c6a..7477e05f0c7f 100644 --- 
a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -793,11 +793,10 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>  ; KNL-NEXT:    cmpl %esi, %edi  ; KNL-NEXT:    setb %al  ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k1 -; KNL-NEXT:    movl {{.*}}(%rip), %ecx -; KNL-NEXT:    vpbroadcastd %ecx, %zmm0 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm0, %xmm0  ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k1 -; KNL-NEXT:    vpbroadcastd %ecx, %zmm1 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm1, %xmm1  ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -1432,8 +1431,7 @@ define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {  ; CHECK-NEXT:    ## kill: def %edi killed %edi def %rdi  ; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)  ; CHECK-NEXT:    andl $15, %edi -; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax -; CHECK-NEXT:    movb (%rdi,%rax), %al +; CHECK-NEXT:    movb -24(%rsp,%rdi), %al  ; CHECK-NEXT:    retq    %t2 = extractelement <16 x i8> %t1, i32 %index    ret i8 %t2 @@ -1452,8 +1450,7 @@ define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {  ; CHECK-NEXT:    ## kill: def %edi killed %edi def %rdi  ; CHECK-NEXT:    vmovaps %ymm0, (%rsp)  ; CHECK-NEXT:    andl $31, %edi -; CHECK-NEXT:    movq %rsp, %rax -; CHECK-NEXT:    movb (%rdi,%rax), %al +; CHECK-NEXT:    movb (%rsp,%rdi), %al  ; CHECK-NEXT:    movq %rbp, %rsp  ; CHECK-NEXT:    popq %rbp  ; CHECK-NEXT:    vzeroupper @@ -1477,8 +1474,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {  ; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovaps %ymm0, (%rsp)  ; KNL-NEXT:    andl $63, %edi -; KNL-NEXT:    movq %rsp, %rax -; KNL-NEXT:    movb (%rdi,%rax), %al +; KNL-NEXT:    movb (%rsp,%rdi), %al  ; KNL-NEXT:    movq %rbp, %rsp  ; KNL-NEXT:    popq %rbp  ; KNL-NEXT:    vzeroupper @@ -1496,8 +1492,7 @@ define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {  ; SKX-NEXT:    ## kill: def %edi killed %edi def %rdi  ; SKX-NEXT:    vmovaps %zmm0, (%rsp)  ; SKX-NEXT:    andl $63, %edi -; SKX-NEXT:    movq %rsp, %rax -; SKX-NEXT:    movb (%rdi,%rax), %al +; SKX-NEXT:    movb (%rsp,%rdi), %al  ; SKX-NEXT:    movq %rbp, %rsp  ; SKX-NEXT:    popq %rbp  ; SKX-NEXT:    vzeroupper @@ -1522,8 +1517,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)  ; KNL-NEXT:    vmovaps %ymm0, (%rsp)  ; KNL-NEXT:    movzbl %dil, %eax  ; KNL-NEXT:    andl $63, %eax -; KNL-NEXT:    movq %rsp, %rcx -; KNL-NEXT:    movb (%rax,%rcx), %al +; KNL-NEXT:    movb (%rsp,%rax), %al  ; KNL-NEXT:    movq %rbp, %rsp  ; KNL-NEXT:    popq %rbp  ; KNL-NEXT:    vzeroupper @@ -1542,8 +1536,7 @@ define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index)  ; SKX-NEXT:    vmovaps %zmm0, (%rsp)  ; SKX-NEXT:    movzbl %dil, %eax  ; SKX-NEXT:    andl $63, %eax -; SKX-NEXT:    movq %rsp, %rcx -; SKX-NEXT:    movb (%rax,%rcx), %al +; SKX-NEXT:    movb (%rsp,%rax), %al  ; SKX-NEXT:    movq %rbp, %rsp  ; SKX-NEXT:    popq %rbp  ; SKX-NEXT:    vzeroupper @@ -1617,45 +1610,28 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,  define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {  ; KNL-LABEL: test_extractelement_varible_v8i1:  ; KNL:       ## %bb.0: -; 
KNL-NEXT:    pushq %rbp -; KNL-NEXT:    .cfi_def_cfa_offset 16 -; KNL-NEXT:    .cfi_offset %rbp, -16 -; KNL-NEXT:    movq %rsp, %rbp -; KNL-NEXT:    .cfi_def_cfa_register %rbp -; KNL-NEXT:    andq $-64, %rsp -; KNL-NEXT:    subq $128, %rsp  ; KNL-NEXT:    ## kill: def %edi killed %edi def %rdi  ; KNL-NEXT:    ## kill: def %ymm1 killed %ymm1 def %zmm1  ; KNL-NEXT:    ## kill: def %ymm0 killed %ymm0 def %zmm0  ; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 -; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT:    vmovdqa64 %zmm0, (%rsp) +; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT:    vpmovdw %zmm0, %ymm0 +; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)  ; KNL-NEXT:    andl $7, %edi -; KNL-NEXT:    movzbl (%rsp,%rdi,8), %eax +; KNL-NEXT:    movzbl -24(%rsp,%rdi,2), %eax  ; KNL-NEXT:    andl $1, %eax -; KNL-NEXT:    movq %rbp, %rsp -; KNL-NEXT:    popq %rbp  ; KNL-NEXT:    vzeroupper  ; KNL-NEXT:    retq  ;  ; SKX-LABEL: test_extractelement_varible_v8i1:  ; SKX:       ## %bb.0: -; SKX-NEXT:    pushq %rbp -; SKX-NEXT:    .cfi_def_cfa_offset 16 -; SKX-NEXT:    .cfi_offset %rbp, -16 -; SKX-NEXT:    movq %rsp, %rbp -; SKX-NEXT:    .cfi_def_cfa_register %rbp -; SKX-NEXT:    andq $-64, %rsp -; SKX-NEXT:    subq $128, %rsp  ; SKX-NEXT:    ## kill: def %edi killed %edi def %rdi  ; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0 -; SKX-NEXT:    vpmovm2q %k0, %zmm0 -; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp) +; SKX-NEXT:    vpmovm2w %k0, %xmm0 +; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)  ; SKX-NEXT:    andl $7, %edi -; SKX-NEXT:    movzbl (%rsp,%rdi,8), %eax +; SKX-NEXT:    movzbl -24(%rsp,%rdi,2), %eax  ; SKX-NEXT:    andl $1, %eax -; SKX-NEXT:    movq %rbp, %rsp -; SKX-NEXT:    popq %rbp  ; SKX-NEXT:    vzeroupper  ; SKX-NEXT:    retq    %t1 = icmp ugt <8 x i32> %a, %b @@ -1667,43 +1643,26 @@ define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b,  define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {  ; KNL-LABEL: test_extractelement_varible_v16i1:  ; KNL:       ## %bb.0: -; KNL-NEXT:    pushq %rbp -; KNL-NEXT:    .cfi_def_cfa_offset 16 -; KNL-NEXT:    .cfi_offset %rbp, -16 -; KNL-NEXT:    movq %rsp, %rbp -; KNL-NEXT:    .cfi_def_cfa_register %rbp -; KNL-NEXT:    andq $-64, %rsp -; KNL-NEXT:    subq $128, %rsp  ; KNL-NEXT:    ## kill: def %edi killed %edi def %rdi  ; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1  ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; KNL-NEXT:    vmovdqa32 %zmm0, (%rsp) +; KNL-NEXT:    vpmovdb %zmm0, %xmm0 +; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)  ; KNL-NEXT:    andl $15, %edi -; KNL-NEXT:    movzbl (%rsp,%rdi,4), %eax +; KNL-NEXT:    movzbl -24(%rsp,%rdi), %eax  ; KNL-NEXT:    andl $1, %eax -; KNL-NEXT:    movq %rbp, %rsp -; KNL-NEXT:    popq %rbp  ; KNL-NEXT:    vzeroupper  ; KNL-NEXT:    retq  ;  ; SKX-LABEL: test_extractelement_varible_v16i1:  ; SKX:       ## %bb.0: -; SKX-NEXT:    pushq %rbp -; SKX-NEXT:    .cfi_def_cfa_offset 16 -; SKX-NEXT:    .cfi_offset %rbp, -16 -; SKX-NEXT:    movq %rsp, %rbp -; SKX-NEXT:    .cfi_def_cfa_register %rbp -; SKX-NEXT:    andq $-64, %rsp -; SKX-NEXT:    subq $128, %rsp  ; SKX-NEXT:    ## kill: def %edi killed %edi def %rdi  ; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0 -; SKX-NEXT:    vpmovm2d %k0, %zmm0 -; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT:    vpmovm2b %k0, %xmm0 +; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)  ; SKX-NEXT:    andl $15, %edi -; SKX-NEXT:    movzbl (%rsp,%rdi,4), %eax +; 
SKX-NEXT:    movzbl -24(%rsp,%rdi), %eax  ; SKX-NEXT:    andl $1, %eax -; SKX-NEXT:    movq %rbp, %rsp -; SKX-NEXT:    popq %rbp  ; SKX-NEXT:    vzeroupper  ; SKX-NEXT:    retq    %t1 = icmp ugt <16 x i32> %a, %b @@ -1729,8 +1688,7 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,  ; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0  ; KNL-NEXT:    vmovdqa %ymm0, (%rsp)  ; KNL-NEXT:    andl $31, %edi -; KNL-NEXT:    movq %rsp, %rax -; KNL-NEXT:    movzbl (%rdi,%rax), %eax +; KNL-NEXT:    movzbl (%rsp,%rdi), %eax  ; KNL-NEXT:    andl $1, %eax  ; KNL-NEXT:    movq %rbp, %rsp  ; KNL-NEXT:    popq %rbp @@ -1744,14 +1702,14 @@ define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b,  ; SKX-NEXT:    .cfi_offset %rbp, -16  ; SKX-NEXT:    movq %rsp, %rbp  ; SKX-NEXT:    .cfi_def_cfa_register %rbp -; SKX-NEXT:    andq $-64, %rsp -; SKX-NEXT:    subq $128, %rsp +; SKX-NEXT:    andq $-32, %rsp +; SKX-NEXT:    subq $64, %rsp  ; SKX-NEXT:    ## kill: def %edi killed %edi def %rdi  ; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 -; SKX-NEXT:    vpmovm2w %k0, %zmm0 -; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT:    vpmovm2b %k0, %ymm0 +; SKX-NEXT:    vmovdqa %ymm0, (%rsp)  ; SKX-NEXT:    andl $31, %edi -; SKX-NEXT:    movzbl (%rsp,%rdi,2), %eax +; SKX-NEXT:    movzbl (%rsp,%rdi), %eax  ; SKX-NEXT:    andl $1, %eax  ; SKX-NEXT:    movq %rbp, %rsp  ; SKX-NEXT:    popq %rbp @@ -1792,8 +1750,7 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {  ; KNL-NEXT:    andl $31, %esi  ; KNL-NEXT:    testb %dil, %dil  ; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT:    setne (%rsi,%rax) +; KNL-NEXT:    setne 32(%rsp,%rsi)  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0  ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1  ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1 @@ -1817,20 +1774,18 @@ define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {  ; SKX-NEXT:    .cfi_offset %rbp, -16  ; SKX-NEXT:    movq %rsp, %rbp  ; SKX-NEXT:    .cfi_def_cfa_register %rbp -; SKX-NEXT:    andq $-64, %rsp -; SKX-NEXT:    subq $128, %rsp +; SKX-NEXT:    andq $-32, %rsp +; SKX-NEXT:    subq $64, %rsp  ; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi  ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k1 -; SKX-NEXT:    xorl %eax, %eax -; SKX-NEXT:    testb %dil, %dil -; SKX-NEXT:    setne %al -; SKX-NEXT:    vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) +; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0  ; SKX-NEXT:    andl $31, %esi -; SKX-NEXT:    movw %ax, (%rsp,%rsi,2) -; SKX-NEXT:    vpsllw $15, (%rsp), %zmm0 -; SKX-NEXT:    vpmovw2m %zmm0, %k0 +; SKX-NEXT:    testb %dil, %dil +; SKX-NEXT:    vpmovm2b %k0, %ymm0 +; SKX-NEXT:    vmovdqa %ymm0, (%rsp) +; SKX-NEXT:    setne (%rsp,%rsi) +; SKX-NEXT:    vpsllw $7, (%rsp), %ymm0 +; SKX-NEXT:    vpmovb2m %ymm0, %k0  ; SKX-NEXT:    kmovd %k0, %eax  ; SKX-NEXT:    movq %rbp, %rsp  ; SKX-NEXT:    popq %rbp @@ -1863,8 +1818,7 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {  ; KNL-NEXT:    testb %dil, %dil  ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT:    setne (%rsi,%rax) +; KNL-NEXT:    setne 64(%rsp,%rsi)  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm0  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1  ; KNL-NEXT:    vextracti128 $1, %ymm0, 
%xmm2 @@ -1905,13 +1859,12 @@ define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {  ; SKX-NEXT:    subq $128, %rsp  ; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi  ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1 +; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0  ; SKX-NEXT:    andl $63, %esi  ; SKX-NEXT:    testb %dil, %dil -; SKX-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} +; SKX-NEXT:    vpmovm2b %k0, %zmm0  ; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT:    movq %rsp, %rax -; SKX-NEXT:    setne (%rsi,%rax) +; SKX-NEXT:    setne (%rsp,%rsi)  ; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0  ; SKX-NEXT:    vpmovb2m %zmm0, %k0  ; SKX-NEXT:    kmovq %k0, %rax @@ -2050,8 +2003,7 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {  ; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovdqa %ymm2, {{[0-9]+}}(%rsp) -; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; KNL-NEXT:    setne (%rax,%rcx) +; KNL-NEXT:    setne 128(%rsp,%rax)  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm2  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3 @@ -2215,18 +2167,16 @@ define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {  ; SKX-NEXT:    vpinsrb $15, 728(%rbp), %xmm2, %xmm2  ; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1  ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k1 -; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k0 +; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k1  ; SKX-NEXT:    movl 744(%rbp), %eax  ; SKX-NEXT:    andl $127, %eax  ; SKX-NEXT:    cmpb $0, 736(%rbp) -; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2} {z} -; SKX-NEXT:    vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) -; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT:    vpmovm2b %k1, %zmm0 +; SKX-NEXT:    vmovdqa32 %zmm0, {{[0-9]+}}(%rsp) +; SKX-NEXT:    vpmovm2b %k0, %zmm0  ; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT:    movq %rsp, %rcx -; SKX-NEXT:    setne (%rax,%rcx) +; SKX-NEXT:    setne (%rsp,%rax)  ; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0  ; SKX-NEXT:    vpmovb2m %zmm0, %k0  ; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0 @@ -2270,8 +2220,7 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index  ; KNL-NEXT:    vmovdqa %ymm2, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp) -; KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; KNL-NEXT:    setne (%rsi,%rax) +; KNL-NEXT:    setne 128(%rsp,%rsi)  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm1  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm2  ; KNL-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %ymm3 @@ -2336,17 +2285,15 @@ define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index  ; SKX-NEXT:    subq $256, %rsp ## imm = 0x100  ; SKX-NEXT:    ## kill: def %esi killed %esi def %rsi  ; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k1 -; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k2 +; SKX-NEXT:    vpcmpnleub %zmm2, %zmm0, %k0 +; SKX-NEXT:    vpcmpnleub %zmm2, %zmm1, %k1  ; SKX-NEXT:    andl $127, %esi  ; SKX-NEXT:    testb %dil, %dil -; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SKX-NEXT:    vmovdqu8 %zmm0, %zmm1 {%k2} {z} -; SKX-NEXT:    vmovdqa32 %zmm1, {{[0-9]+}}(%rsp) -; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT:    vpmovm2b %k1, %zmm0 +; SKX-NEXT:    vmovdqa32 %zmm0, {{[0-9]+}}(%rsp) +; SKX-NEXT:    vpmovm2b %k0, %zmm0  ; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp) -; SKX-NEXT:    movq %rsp, %rax -; SKX-NEXT:    setne (%rsi,%rax) +; SKX-NEXT:    setne (%rsp,%rsi)  ; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0  ; SKX-NEXT:    vpmovb2m %zmm0, %k0  ; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0 diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll index e28e384ae996..9283fd32d746 100644 --- a/test/CodeGen/X86/avx512-insert-extract_i1.ll +++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll @@ -18,8 +18,7 @@ define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b,  ; SKX-NEXT:    vpmovm2b %k0, %zmm0  ; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)  ; SKX-NEXT:    andl $63, %edi -; SKX-NEXT:    movq %rsp, %rax -; SKX-NEXT:    movzbl (%rdi,%rax), %eax +; SKX-NEXT:    movzbl (%rsp,%rdi), %eax  ; SKX-NEXT:    andl $1, %eax  ; SKX-NEXT:    movq %rbp, %rsp  ; SKX-NEXT:    popq %rbp diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index dfe42d53483f..4877157d911d 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1783,20 +1783,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {  ; KNL-NEXT:    vmovups (%rdi), %zmm2  ; KNL-NEXT:    vmovups 64(%rdi), %zmm3  ; KNL-NEXT:    vcmpltps %zmm0, %zmm2, %k1 -; KNL-NEXT:    movl {{.*}}(%rip), %eax -; KNL-NEXT:    vpbroadcastd %eax, %zmm2 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm2, %xmm2  ; KNL-NEXT:    vcmpltps %zmm1, %zmm3, %k2 -; KNL-NEXT:    vpbroadcastd %eax, %zmm3 {%k2} {z} +; KNL-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}  ; KNL-NEXT:    vpmovdb %zmm3, %xmm3  ; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k2} {z}  ; KNL-NEXT:    vmovups 4(%rdi), %zmm5 {%k1} {z}  ; KNL-NEXT:    vcmpltps %zmm5, %zmm0, %k1 -; KNL-NEXT:    vpbroadcastd %eax, %zmm5 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm5, %xmm5  ; KNL-NEXT:    vpor %xmm5, %xmm2, %xmm2  ; KNL-NEXT:    vcmpltps %zmm4, %zmm1, %k1 -; KNL-NEXT:    vpbroadcastd %eax, %zmm4 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm4, %xmm4  ; KNL-NEXT:    vpor %xmm4, %xmm3, %xmm3  ; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3 @@ -1886,20 +1885,19 @@ define void @ktest_2(<32 x float> %in, float * %base) {  ; AVX512DQ-NEXT:    vmovups (%rdi), %zmm2  ; AVX512DQ-NEXT:    vmovups 64(%rdi), %zmm3  ; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k1 -; AVX512DQ-NEXT:    movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm2 {%k1} {z} +; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2  ; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2  ; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm3, %k2 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm3 {%k2} {z} +; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm3  ; AVX512DQ-NEXT:    vpmovdb %zmm3, %xmm3  ; AVX512DQ-NEXT:    vmovups 68(%rdi), %zmm4 {%k2} {z}  ; AVX512DQ-NEXT:    vmovups 4(%rdi), %zmm5 {%k1} {z} -; AVX512DQ-NEXT:    vcmpltps %zmm5, %zmm0, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm5 {%k1} {z} +; AVX512DQ-NEXT:    vcmpltps 
%zmm5, %zmm0, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm5  ; AVX512DQ-NEXT:    vpmovdb %zmm5, %xmm5  ; AVX512DQ-NEXT:    vpor %xmm5, %xmm2, %xmm2 -; AVX512DQ-NEXT:    vcmpltps %zmm4, %zmm1, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm4 {%k1} {z} +; AVX512DQ-NEXT:    vcmpltps %zmm4, %zmm1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm4  ; AVX512DQ-NEXT:    vpmovdb %zmm4, %xmm4  ; AVX512DQ-NEXT:    vpor %xmm4, %xmm3, %xmm3  ; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm3 diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 1b450b98a6d5..78111874b58a 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -4376,20 +4376,16 @@ define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {  define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {  ; GENERIC-LABEL: trunc_4i32_to_4i1:  ; GENERIC:       # %bb.0: +; GENERIC-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]  ; GENERIC-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT:    vptestmd %xmm0, %xmm0, %k1 # sched: [1:1.00] -; GENERIC-NEXT:    vpslld $31, %xmm1, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT:    vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [1:1.00] -; GENERIC-NEXT:    vpmovm2d %k0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT:    vpsrad $31, %xmm0, %xmm0 # sched: [1:1.00]  ; GENERIC-NEXT:    retq # sched: [1:1.00]  ;  ; SKX-LABEL: trunc_4i32_to_4i1:  ; SKX:       # %bb.0: +; SKX-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]  ; SKX-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; SKX-NEXT:    vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] -; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00] -; SKX-NEXT:    vpmovm2d %k0, %xmm0 # sched: [1:0.25] +; SKX-NEXT:    vpsrad $31, %xmm0, %xmm0 # sched: [1:0.50]  ; SKX-NEXT:    retq # sched: [7:1.00]    %mask_a = trunc <4 x i32>%a to <4 x i1>    %mask_b = trunc <4 x i32>%b to <4 x i1> diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index df88f0fca456..0601c011e290 100644 --- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1488,12 +1488,10 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x  define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {  ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask0:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] -; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm1 +; CHECK-NEXT:    vmovdqa %xmm1, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1503,13 +1501,11 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i  ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] -; CHECK-NEXT:    
vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 -; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT:    vpermi2d %ymm3, %ymm0, %ymm4 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm2, %k1 +; CHECK-NEXT:    vpblendmd %xmm4, %xmm1, %xmm0 {%k1}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1522,13 +1518,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x  ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask0:  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,0,3,4,6,4,7] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7] -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 -; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12] +; CHECK-NEXT:    vpermi2d %ymm2, %ymm0, %ymm3 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vpcmpeqd %xmm0, %xmm1, %k1 +; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12> @@ -1863,14 +1857,12 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp, <4  ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask1:  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm2[3,1,2,3,7,5,6,7] -; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm2 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7] -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm1, %k1 -; CHECK-NEXT:    vmovdqa32 %xmm2, %xmm0 {%k1} +; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT:    vpermi2d %ymm2, %ymm3, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 +; CHECK-NEXT:    vmovdqa32 %xmm4, %xmm0 {%k1}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x i32>, <16 x i32>* %vp @@ -1884,14 +1876,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask1(<16 x i32>* %vp,  ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask1:  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[3,1,2,3,7,5,6,7] -; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,1,3,2,4,5,7,6] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm0, %k1 -; CHECK-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT:    
vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [15,5,3,2,15,5,7,6] +; CHECK-NEXT:    vpermi2d %ymm1, %ymm2, %ymm3 +; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT:    vmovdqa32 %xmm3, %xmm0 {%k1} {z}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x i32>, <16 x i32>* %vp @@ -2298,13 +2288,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i  define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[3,1,2,3] -; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,3,3,1] +; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>    %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2315,13 +2304,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[3,1,2,3] -; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,3,1] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,3,3,1] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT:    vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm2, %ymm0  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>    %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2332,12 +2320,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64  ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5] -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [4,1,0,6] +; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>    %cmp 
= icmp eq <4 x i64> %mask, zeroinitializer @@ -2348,13 +2335,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask5:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,1,0,6] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm2, %ymm0  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>    %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2405,13 +2391,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i  define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm0[2,0,3,3] -; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0 -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [2,0,3,4] +; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm4 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT:    vpblendmq %ymm4, %ymm1, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>    %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2422,13 +2407,12 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[2,0,3,3] -; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0 -; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,0,3,4] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT:    vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm2, %ymm0  ; CHECK-NEXT:    retq    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>    %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2585,12 +2569,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2  
; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,1,2,1] -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,1,1,5] +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> @@ -2602,14 +2585,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,1] -; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,1,1,5] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm1, %ymm0  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 1, i32 1, i32 5> @@ -2669,12 +2651,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2  ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,1,4,5,4,5] -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5],ymm2[6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,4,6,1] +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2686,14 +2667,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask4(<8 x i64>* %vp, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask4:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT:    vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,1,4,5,4,5] -; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    
vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,4,6,1] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm1, %ymm0  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 0, i32 4, i32 6, i32 1> @@ -2739,11 +2719,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4  define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp) {  ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask6:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovaps (%rdi), %zmm0 -; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,2] -; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [7,2,3,2] +; CHECK-NEXT:    vpermi2q %ymm2, %ymm1, %ymm0  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> @@ -2754,12 +2733,11 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2  ; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[3,1,2,3] -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,3,2] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm2, %ymm0 {%k1} +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm4 = [7,2,3,2] +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT:    vmovdqa64 %ymm4, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, i32 3, i32 2> @@ -2771,14 +2749,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x  define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask6(<8 x i64>* %vp, <4 x i64> %mask) {  ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask6:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[3,1,2,3] -; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,3,2] -; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT:    vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT:    vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [7,2,3,2] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT:    vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT:    vmovdqa %ymm1, %ymm0  ; CHECK-NEXT:    retq    %vec = load <8 x i64>, <8 x i64>* %vp    %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 2, 
i32 3, i32 2> @@ -3307,14 +3284,13 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %v  define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {  ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = <0,4,u,u,6,1,4,4> -; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm3 -; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1 -; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT:    vmovddup {{.*#+}} xmm3 = xmm3[0,0] +; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT:    vpermi2ps %ymm3, %ymm0, %ymm4 +; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT:    vcmpeqps %ymm0, %ymm2, %k1 +; CHECK-NEXT:    vblendmps %ymm4, %ymm1, %ymm0 {%k1}  ; CHECK-NEXT:    retq    %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>    %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3325,14 +3301,13 @@ define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec  define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %mask) {  ; CHECK-LABEL: test_masked_z_16xfloat_to_8xfloat_perm_mask2:  ; CHECK:       # %bb.0: -; CHECK-NEXT:    vmovapd {{.*#+}} ymm2 = <0,4,u,u,6,1,4,4> -; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm2 -; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1 -; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT:    vmovddup {{.*#+}} xmm3 = xmm2[0,0] +; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,4,10,11,6,1,4,4] +; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT:    vcmpeqps %ymm4, %ymm1, %k1 +; CHECK-NEXT:    vpermi2ps %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT:    vmovaps %ymm2, %ymm0  ; CHECK-NEXT:    retq    %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>    %cmp = fcmp oeq <8 x float> %mask, zeroinitializer @@ -3775,12 +3750,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float>*  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovaps (%rdi), %zmm2  ; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm3[0,2],ymm2[2,3],ymm3[4,6],ymm2[6,7] -; CHECK-NEXT:    vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} +; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [0,10,6,15,4,14,6,15] +; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x float>, <16 x float>* %vp @@ -3795,12 +3769,11 @@ define <4 x float> 
@test_masked_z_16xfloat_to_4xfloat_perm_mem_mask1(<16 x float  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovaps (%rdi), %zmm1  ; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7] -; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,10,6,15,4,14,6,15] +; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm3 +; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1 +; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x float>, <16 x float>* %vp @@ -3815,12 +3788,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float>*  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovaps (%rdi), %zmm2  ; CHECK-NEXT:    vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4] -; CHECK-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm3[2,3],ymm2[6,4],ymm3[6,7] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] -; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT:    vcmpeqps %xmm3, %xmm1, %k1 -; CHECK-NEXT:    vmovaps %xmm2, %xmm0 {%k1} +; CHECK-NEXT:    vmovaps {{.*#+}} ymm4 = [4,14,4,14,4,14,6,7] +; CHECK-NEXT:    vpermi2ps %ymm2, %ymm3, %ymm4 +; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1 +; CHECK-NEXT:    vmovaps %xmm4, %xmm0 {%k1}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x float>, <16 x float>* %vp @@ -3835,12 +3807,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask2(<16 x float  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vmovaps (%rdi), %zmm1  ; CHECK-NEXT:    vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,0],ymm1[6,4],ymm2[4,4] -; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,3],ymm1[6,4],ymm2[6,7] -; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] -; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT:    vcmpeqps %xmm2, %xmm0, %k1 -; CHECK-NEXT:    vmovaps %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,14,4,14,4,14,6,7] +; CHECK-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm3 +; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1 +; CHECK-NEXT:    vmovaps %xmm3, %xmm0 {%k1} {z}  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq    %vec = load <16 x float>, <16 x float>* %vp diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll index 1182bbf94ec5..6bee0de181ab 100644 --- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll +++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll @@ -30,10 +30,9 @@ define <8 x i1> @test2(<2 x i1> %a) {  ; CHECK:       # %bb.0:  ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0  ; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k0 -; CHECK-NEXT:    vpmovm2q %k0, %zmm0 -; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; CHECK-NEXT:    vpmovq2m %zmm0, %k0 +; CHECK-NEXT:    vpmovm2d %k0, %ymm0 +; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; CHECK-NEXT:    vpmovd2m %ymm0, %k0  ; CHECK-NEXT:    vpmovm2w %k0, %xmm0  ; CHECK-NEXT:    vzeroupper  ; CHECK-NEXT:    retq diff --git 
a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index fc684e54b063..826a4538f3f1 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -192,15 +192,14 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {  ; KNL-NEXT:    andq $-32, %rsp  ; KNL-NEXT:    subq $32, %rsp  ; KNL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 -; KNL-NEXT:    movl {{.*}}(%rip), %eax -; KNL-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm1, %xmm1  ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1  ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1  ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0  ; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; KNL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 -; KNL-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; KNL-NEXT:    vpmovdb %zmm0, %xmm0  ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0  ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0 diff --git a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll index 9863a0a7d283..5f4b050b863d 100644 --- a/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll +++ b/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll @@ -101,7 +101,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -147,7 +147,7 @@ define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -282,7 +282,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -333,7 +333,7 @@ define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -1354,7 +1354,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, 
%zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -1401,7 +1401,7 @@ define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -1543,7 +1543,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -1595,7 +1595,7 @@ define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -4799,7 +4799,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__  ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -4839,7 +4839,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>  ; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -4882,7 +4882,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x  ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -4926,7 +4926,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ 
-4970,7 +4970,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %_  ; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k1  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5014,7 +5014,7 @@ define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u  ; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5061,7 +5061,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5106,7 +5106,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5154,7 +5154,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5203,7 +5203,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5252,7 +5252,7 @@ define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %_  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -5301,7 +5301,7 @@ define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u  ; NoVLX-NEXT:    
kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -9211,7 +9211,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -9257,7 +9257,7 @@ define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -9392,7 +9392,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -9443,7 +9443,7 @@ define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -10464,7 +10464,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -10511,7 +10511,7 @@ define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u,  ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}  ; NoVLX-NEXT:    kxorw %k0, %k0, %k0  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0  ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0  ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0 @@ -10653,7 +10653,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)  ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) -; NoVLX-NEXT: 
vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -10705,7 +10705,7 @@ define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -13909,7 +13909,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -13949,7 +13949,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -13992,7 +13992,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14036,7 +14036,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14080,7 +14080,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14124,7 +14124,7 @@ define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14171,7 +14171,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14216,7 +14216,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14264,7 +14264,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14313,7 +14313,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14362,7 +14362,7 @@ define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14411,7 +14411,7 @@ define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -18328,7 +18328,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -18377,7 +18377,7 @@ define zeroext i32 @test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -18519,7 +18519,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -18573,7 +18573,7 @@ define zeroext i64 @test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -19641,7 +19641,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -19691,7 +19691,7 @@ define zeroext i32 @test_masked_vpcmpsgew_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -19840,7 +19840,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -19895,7 +19895,7 @@ define zeroext i64 @test_masked_vpcmpsgew_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23147,7 +23147,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23187,7 +23187,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23230,7 +23230,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23274,7 +23274,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23318,7 +23318,7 @@ define zeroext i32 @test_vpcmpsged_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    vpcmpnltd (%rdi){1to16}, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23362,7 +23362,7 @@ define zeroext i32 @test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23409,7 +23409,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23454,7 +23454,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23502,7 +23502,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23551,7 +23551,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23600,7 +23600,7 @@ define zeroext i64 @test_vpcmpsged_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -23649,7 +23649,7 @@ define zeroext i64 @test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -27664,7 +27664,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -27713,7 +27713,7 @@ define zeroext i32 @test_masked_vpcmpultb_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -27857,7 +27857,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask(i16 zeroext %__u, <2
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -27911,7 +27911,7 @@ define zeroext i64 @test_masked_vpcmpultb_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -28989,7 +28989,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -29039,7 +29039,7 @@ define zeroext i32 @test_masked_vpcmpultw_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -29190,7 +29190,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask(i16 zeroext %__u, <4
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -29245,7 +29245,7 @@ define zeroext i64 @test_masked_vpcmpultw_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32541,7 +32541,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32581,7 +32581,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32624,7 +32624,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32668,7 +32668,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32712,7 +32712,7 @@ define zeroext i32 @test_vpcmpultd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32756,7 +32756,7 @@ define zeroext i32 @test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32803,7 +32803,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32848,7 +32848,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32896,7 +32896,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32945,7 +32945,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -32994,7 +32994,7 @@ define zeroext i64 @test_vpcmpultd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -33043,7 +33043,7 @@ define zeroext i64 @test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39363,7 +39363,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39403,7 +39403,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39444,7 +39444,7 @@ define zeroext i32 @test_vcmpoeqps_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, float*
 ; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39488,7 +39488,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39532,7 +39532,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39577,7 +39577,7 @@ define zeroext i32 @test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39666,7 +39666,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %_
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39711,7 +39711,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39757,7 +39757,7 @@ define zeroext i64 @test_vcmpoeqps_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, float*
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39806,7 +39806,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask(i16 zeroext %__u, <8
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39855,7 +39855,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem(i16 zeroext %__u,
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -39905,7 +39905,7 @@ define zeroext i64 @test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b(i16 zeroext %__
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -43827,7 +43827,7 @@ define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
 ; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k1
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
index 2276e5634537..78c44e4dca3b 100644
--- a/test/CodeGen/X86/bitcast-and-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -30,13 +30,13 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
 ; AVX512F-LABEL: v8i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
 ; AVX512F-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm0
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0 {%k1}
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512F-NEXT:    vzeroupper
@@ -943,12 +943,12 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
 ; AVX512F-NEXT:    vpsllw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpmovsxwq %xmm2, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512F-NEXT:    vpmovsxwd %xmm2, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0 {%k1}
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512F-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/bitcast-setcc-128.ll b/test/CodeGen/X86/bitcast-setcc-128.ll
index 7d0381837b70..8fdacb7b79d6 100644
--- a/test/CodeGen/X86/bitcast-setcc-128.ll
+++ b/test/CodeGen/X86/bitcast-setcc-128.ll
@@ -26,9 +26,9 @@ define i8 @v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512F-LABEL: v8i16:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512F-NEXT:    vzeroupper
@@ -640,9 +640,9 @@ define i8 @v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; AVX512F-NEXT:    vpsllw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512F-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512F-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index e92237f524f5..c2da74a1646c 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -220,6 +220,16 @@ define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
   ret <4 x i32> %2
 }
 
+define <8 x i16> @and_or_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: and_or_v8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [15,7,3,1,14,10,2,32767]
+; CHECK-NEXT:    retq
+  %1 = or <8 x i16> %a0, <i16 255, i16 127, i16 63, i16 31, i16 15, i16 31, i16 63, i16 -1>
+  %2 = and <8 x i16> %1, <i16 15, i16 7, i16 3, i16 1, i16 14, i16 10, i16 2, i16 32767>
+  ret <8 x i16> %2
+}
+
 ;
 ; known bits folding
 ;
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
index 1601c67dce25..65184c4d278e 100644
--- a/test/CodeGen/X86/combine-or.ll
+++ b/test/CodeGen/X86/combine-or.ll
@@ -430,25 +430,35 @@ define <4 x i32> @test2f(<4 x i32> %a, <4 x i32> %b) {
   ret <4 x i32> %or
 }
 
-; TODO: Why would we do this?
-; (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+; (or (and X, c1), c2) -> (and (or X, c2), c1|c2) iff (c1 & c2) != 0
 
 define <2 x i64> @or_and_v2i64(<2 x i64> %a0) {
 ; CHECK-LABEL: or_and_v2i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %1 = and <2 x i64> %a0, <i64 7, i64 7>
   %2 = or <2 x i64> %1, <i64 3, i64 3>
   ret <2 x i64> %2
 }
 
-; If all masked bits are going to be set, that's a constant fold.
-
 define <4 x i32> @or_and_v4i32(<4 x i32> %a0) {
 ; CHECK-LABEL: or_and_v4i32:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    orps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %1 = and <4 x i32> %a0, <i32 1, i32 3, i32 5, i32 7>
+  %2 = or <4 x i32> %1, <i32 3, i32 2, i32 15, i32 2>
+  ret <4 x i32> %2
+}
+
+; If all masked bits are going to be set, that's a constant fold.
+
+define <4 x i32> @or_and_v4i32_fold(<4 x i32> %a0) {
+; CHECK-LABEL: or_and_v4i32_fold:
+; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
 ; CHECK-NEXT:    retq
   %1 = and <4 x i32> %a0, <i32 1, i32 1, i32 1, i32 1>
diff --git a/test/CodeGen/X86/darwin-bzero.ll b/test/CodeGen/X86/darwin-bzero.ll
index 3099526028ab..410d67ff0ec1 100644
--- a/test/CodeGen/X86/darwin-bzero.ll
+++ b/test/CodeGen/X86/darwin-bzero.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | grep __bzero
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
 
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 
+; CHECK-LABEL: foo:
+; CHECK: {{calll|callq}} ___bzero
 define void @foo(i8* %p, i32 %len) {
   call void @llvm.memset.p0i8.i32(i8* %p, i8 0, i32 %len, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 4d24a15fe2e1..66bdfb8475f1 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -403,16 +403,14 @@ define i8 @extractelement_v16i8_var(<16 x i8> %a, i256 %i) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    andl $15, %edi
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    movb (%rdi,%rax), %al
+; SSE-NEXT:    movb -24(%rsp,%rdi), %al
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extractelement_v16i8_var:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    andl $15, %edi
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    movb (%rdi,%rax), %al
+; AVX-NEXT:    movb -24(%rsp,%rdi), %al
 ; AVX-NEXT:    retq
   %b = extractelement <16 x i8> %a, i256 %i
   ret i8 %b
@@ -428,8 +426,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
 ; SSE-NEXT:    andl $31, %edi
 ; SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movaps %xmm0, (%rsp)
-; SSE-NEXT:    movq %rsp, %rax
-; SSE-NEXT:    movb (%rdi,%rax), %al
+; SSE-NEXT:    movb (%rsp,%rdi), %al
 ; SSE-NEXT:    movq %rbp, %rsp
 ; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
@@ -442,8 +439,7 @@ define i8 @extractelement_v32i8_var(<32 x i8> %a, i256 %i) nounwind {
 ; AVX-NEXT:    subq $64, %rsp
 ; AVX-NEXT:    andl $31, %edi
 ; AVX-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX-NEXT:    movq %rsp, %rax
-; AVX-NEXT:    movb (%rdi,%rax), %al
+; AVX-NEXT:    movb (%rsp,%rdi), %al
 ; AVX-NEXT:    movq %rbp, %rsp
 ; AVX-NEXT:    popq %rbp
 ; AVX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 8dacf2dcf971..a0e919d128df 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -140,21 +140,23 @@ declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x doubl
 define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
 ; SKX-LABEL: test11:
 ; SKX:       # %bb.0: # %entry
-; SKX-NEXT:    vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
+; SKX-NEXT:    vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm2
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %xmm2, %xmm0
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test11:
 ; KNL:       # %bb.0: # %entry
-; KNL-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-0,-0,-0,-0]
-; KNL-NEXT:    vxorps %xmm0, %xmm2, %xmm0
+; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; KNL-NEXT:    vxorps %xmm3, %xmm2, %xmm2
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT:    vmovaps %xmm2, %xmm0
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
-  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
   ret <4 x float> %0
 }
 
@@ -164,19 +166,17 @@ define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 z
 ; SKX-LABEL: test11b:
 ; SKX:       # %bb.0: # %entry
 ; SKX-NEXT:    kmovd %edi, %k1
-; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; SKX-NEXT:    vmovaps %xmm1, %xmm0
+; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test11b:
 ; KNL:       # %bb.0: # %entry
 ; KNL-NEXT:    kmovw %edi, %k1
-; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm1 {%k1}
-; KNL-NEXT:    vmovaps %xmm1, %xmm0
+; KNL-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
 ; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
-  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %b, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
   ret <4 x float> %0
 }
 
@@ -305,3 +305,147 @@ define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
+
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test18:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test18:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test19:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test19:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 4) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test20:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %xmm2, %xmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test20:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT:    vmovaps %xmm2, %xmm0
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test21:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test21:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test22:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test22:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %sub.i.2 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 8) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test23:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; SKX-NEXT:    vmovaps %xmm2, %xmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test23:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; KNL-NEXT:    vmovaps %xmm2, %xmm0
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
+  ret <4 x float> %0
+}
+
+define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
+; SKX-LABEL: test24:
+; SKX:       # %bb.0: # %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test24:
+; KNL:       # %bb.0: # %entry
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 8) #10
+  ret <4 x float> %0
+}
+
+define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
+; CHECK-LABEL: test25:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+entry:
+  %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %sub.i.2 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %c
+  %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i16 -1, i32 8) #2
+  ret <16 x float> %0
+}
diff --git a/test/CodeGen/X86/fmsubadd-combine.ll b/test/CodeGen/X86/fmsubadd-combine.ll
index 814d61e22382..ca2c61a88507 100644
--- a/test/CodeGen/X86/fmsubadd-combine.ll
+++ b/test/CodeGen/X86/fmsubadd-combine.ll
@@ -8,26 +8,17 @@
 define <2 x double> @mul_subadd_pd128(<2 x double> %A, <2 x double> %B, <2 x double> %C) #0 {
 ; FMA3_256-LABEL: mul_subadd_pd128:
 ; FMA3_256:       # %bb.0: # %entry
-; FMA3_256-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_256-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_256-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_256-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_256-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
 ; FMA3_256-NEXT:    retq
 ;
 ; FMA3_512-LABEL: mul_subadd_pd128:
 ; FMA3_512:       # %bb.0: # %entry
-; FMA3_512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
-; FMA3_512-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
-; FMA3_512-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; FMA3_512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA3_512-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
 ; FMA3_512-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_pd128:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    vsubpd %xmm2, %xmm0, %xmm1
-; FMA4-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; FMA4-NEXT:    vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <2 x double> %A, %B
@@ -40,18 +31,12 @@ entry:
 define <4 x float> @mul_subadd_ps128(<4 x float> %A, <4 x float> %B, <4 x float> %C) #0 {
 ; FMA3-LABEL: mul_subadd_ps128:
 ; FMA3:       # %bb.0: # %entry
-; FMA3-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA3-NEXT:    vsubps %xmm2, %xmm0, %xmm1
-; FMA3-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; FMA3-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA3-NEXT:    vfmsubadd213ps  %xmm2, %xmm1, %xmm0
 ; FMA3-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_ps128:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; FMA4-NEXT:    vsubps %xmm2, %xmm0, %xmm1
-; FMA4-NEXT:    vaddps %xmm2, %xmm0, %xmm0
-; FMA4-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; FMA4-NEXT:    vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <4 x float> %A, %B
@@ -64,18 +49,12 @@ entry:
 define <4 x double> @mul_subadd_pd256(<4 x double> %A, <4 x double> %B, <4 x double> %C) #0 {
 ; FMA3-LABEL: mul_subadd_pd256:
 ; FMA3:       # %bb.0: # %entry
-; FMA3-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; FMA3-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
-; FMA3-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; FMA3-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA3-NEXT:    vfmsubadd213pd  %ymm2, %ymm1, %ymm0
 ; FMA3-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_pd256:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
-; FMA4-NEXT:    vsubpd %ymm2, %ymm0, %ymm1
-; FMA4-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; FMA4-NEXT:    vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <4 x double> %A, %B
@@ -88,18 +67,12 @@ entry:
 define <8 x float> @mul_subadd_ps256(<8 x float> %A, <8 x float> %B, <8 x float> %C) #0 {
 ; FMA3-LABEL: mul_subadd_ps256:
 ; FMA3:       # %bb.0: # %entry
-; FMA3-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; FMA3-NEXT:    vsubps %ymm2, %ymm0, %ymm1
-; FMA3-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; FMA3-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA3-NEXT:    vfmsubadd213ps  %ymm2, %ymm1, %ymm0
 ; FMA3-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_ps256:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; FMA4-NEXT:    vsubps %ymm2, %ymm0, %ymm1
-; FMA4-NEXT:    vaddps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; FMA4-NEXT:    vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <8 x float> %A, %B
@@ -112,34 +85,19 @@ entry:
 define <8 x double> @mul_subadd_pd512(<8 x double> %A, <8 x double> %B, <8 x double> %C) #0 {
 ; FMA3_256-LABEL: mul_subadd_pd512:
 ; FMA3_256:       # %bb.0: # %entry
-; FMA3_256-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA3_256-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA3_256-NEXT:    vfmsubadd213pd  %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT:    vfmsubadd213pd  %ymm5, %ymm3, %ymm1
 ; FMA3_256-NEXT:    retq
 ;
 ; FMA3_512-LABEL: mul_subadd_pd512:
 ; FMA3_512:       # %bb.0: # %entry
-; FMA3_512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
-; FMA3_512-NEXT:    vsubpd %zmm2, %zmm0, %zmm1
-; FMA3_512-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
-; FMA3_512-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[7]
+; FMA3_512-NEXT:    vfmsubadd213pd  %zmm2, %zmm1, %zmm0
 ; FMA3_512-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_pd512:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    vsubpd %ymm5, %ymm1, %ymm2
-; FMA4-NEXT:    vsubpd %ymm4, %ymm0, %ymm3
-; FMA4-NEXT:    vaddpd %ymm5, %ymm1, %ymm1
-; FMA4-NEXT:    vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3]
-; FMA4-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3]
+; FMA4-NEXT:    vfmsubaddpd %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vfmsubaddpd %ymm5, %ymm3, %ymm1, %ymm1
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <8 x double> %A, %B
@@ -152,35 +110,19 @@ entry:
 define <16 x float> @mul_subadd_ps512(<16 x float> %A, <16 x float> %B, <16 x float> %C) #0 {
 ; FMA3_256-LABEL: mul_subadd_ps512:
 ; FMA3_256:       # %bb.0: # %entry
-; FMA3_256-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA3_256-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA3_256-NEXT:    vsubps %ymm5, %ymm1, %ymm2
-; FMA3_256-NEXT:    vsubps %ymm4, %ymm0, %ymm3
-; FMA3_256-NEXT:    vaddps %ymm5, %ymm1, %ymm1
-; FMA3_256-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA3_256-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA3_256-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA3_256-NEXT:    vfmsubadd213ps  %ymm4, %ymm2, %ymm0
+; FMA3_256-NEXT:    vfmsubadd213ps  %ymm5, %ymm3, %ymm1
 ; FMA3_256-NEXT:    retq
 ;
 ; FMA3_512-LABEL: mul_subadd_ps512:
 ; FMA3_512:       # %bb.0: # %entry
-; FMA3_512-NEXT:    vmulps %zmm1, %zmm0, %zmm1
-; FMA3_512-NEXT:    vaddps %zmm2, %zmm1, %zmm0
-; FMA3_512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; FMA3_512-NEXT:    kmovw %eax, %k1
-; FMA3_512-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; FMA3_512-NEXT:    vfmsubadd213ps  %zmm2, %zmm1, %zmm0
 ; FMA3_512-NEXT:    retq
 ;
 ; FMA4-LABEL: mul_subadd_ps512:
 ; FMA4:       # %bb.0: # %entry
-; FMA4-NEXT:    vmulps %ymm2, %ymm0, %ymm0
-; FMA4-NEXT:    vmulps %ymm3, %ymm1, %ymm1
-; FMA4-NEXT:    vsubps %ymm5, %ymm1, %ymm2
-; FMA4-NEXT:    vsubps %ymm4, %ymm0, %ymm3
-; FMA4-NEXT:    vaddps %ymm5, %ymm1, %ymm1
-; FMA4-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; FMA4-NEXT:    vaddps %ymm4, %ymm0, %ymm0
-; FMA4-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
+; FMA4-NEXT:    vfmsubaddps %ymm4, %ymm2, %ymm0, %ymm0
+; FMA4-NEXT:    vfmsubaddps %ymm5, %ymm3, %ymm1, %ymm1
 ; FMA4-NEXT:    retq
 entry:
   %AB = fmul <16 x float> %A, %B
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
index 481f55e9e10d..db73195698e3 100644
--- a/test/CodeGen/X86/fold-vector-sext-crash.ll
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -9,9 +9,9 @@
 define <4 x i64> @foo(<4 x i64> %A) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm1
-; CHECK-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vmovapd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295,0,0,0,0]
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
 ; CHECK-NEXT:    retl
   %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
   ret <4 x i64> %1
diff --git a/test/CodeGen/X86/horizontal-reduce-smax.ll b/test/CodeGen/X86/horizontal-reduce-smax.ll
index a54e01d9af67..fa92158ae92d 100644
--- a/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -309,30 +309,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X86-SSE42-LABEL: test_reduce_v16i8:
 ; X86-SSE42:       ## %bb.0:
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
 ;
 ; X86-AVX-LABEL: test_reduce_v16i8:
 ; X86-AVX:       ## %bb.0:
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX-NEXT:    retl
@@ -371,30 +366,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {
 ;
 ; X64-SSE42-LABEL: test_reduce_v16i8:
 ; X64-SSE42:       ## %bb.0:
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
 ;
 ; X64-AVX-LABEL: test_reduce_v16i8:
 ; X64-AVX:       ## %bb.0:
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX-NEXT:    retq
@@ -906,16 +896,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-SSE42-LABEL: test_reduce_v32i8:
 ; X86-SSE42:       ## %bb.0:
 ; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -924,14 +911,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-AVX1:       ## %bb.0:
 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:    vzeroupper
@@ -940,15 +925,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X86-AVX2-LABEL: test_reduce_v32i8:
 ; X86-AVX2:       ## %bb.0:
 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX2-NEXT:    vzeroupper
@@ -994,16 +977,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-SSE42-LABEL: test_reduce_v32i8:
 ; X64-SSE42:       ## %bb.0:
 ; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X64-SSE42-NEXT:    psrld $16, %xmm1
-; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X64-SSE42-NEXT:    psrlw $8, %xmm0
-; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X64-SSE42-NEXT:    psrlw $8, %xmm2
+; X64-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X64-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-SSE42-NEXT:    retq
@@ -1012,14 +992,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX1-NEXT:    vzeroupper
@@ -1028,15 +1006,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-AVX2-LABEL: test_reduce_v32i8:
 ; X64-AVX2:       ## %bb.0:
 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX2-NEXT:    vzeroupper
@@ -1045,15 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {
 ; X64-AVX512-LABEL: test_reduce_v32i8:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
-; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0
+; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax
 ; X64-AVX512-NEXT:    vzeroupper
@@ -1743,16 +1717,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-SSE42-NEXT:    pmaxsb %xmm3, %xmm1
 ; X86-SSE42-NEXT:    pmaxsb %xmm2, %xmm0
 ; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
-; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1
-; X86-SSE42-NEXT:    psrld $16, %xmm1
-; X86-SSE42-NEXT:    pmaxsb %xmm0, %xmm1
-; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE42-NEXT:    psrlw $8, %xmm0
-; X86-SSE42-NEXT:    pmaxsb %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
+; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2
+; X86-SSE42-NEXT:    psrlw $8, %xmm2
+; X86-SSE42-NEXT:    pminub %xmm0, %xmm2
+; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0
+; X86-SSE42-NEXT:    pxor %xmm1, %xmm0
 ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax
 ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-SSE42-NEXT:    retl
@@ -1764,14 +1735,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {
 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2
 ; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
-; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; X86-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
+; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0
+; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax
 ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax
 ; X86-AVX1-NEXT:
vzeroupper @@ -1781,15 +1750,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -1847,16 +1814,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-SSE42-NEXT:    pmaxsb %xmm3, %xmm1  ; X64-SSE42-NEXT:    pmaxsb %xmm2, %xmm0  ; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pmaxsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pmaxsb %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -1868,14 +1832,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm3, %xmm2  ; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpmaxsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -1885,15 +1847,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:  
  vpmaxsb %ymm1, %ymm0, %ymm0  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -1902,17 +1862,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v64i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-smin.ll b/test/CodeGen/X86/horizontal-reduce-smin.ll index f03e745598e6..fa5828a45700 100644 --- a/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -311,30 +311,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X86-SSE42-LABEL: test_reduce_v16i8:  ; X86-SSE42:       ## %bb.0: -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; 
X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl  ;  ; X86-AVX-LABEL: test_reduce_v16i8:  ; X86-AVX:       ## %bb.0: -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX-NEXT:    retl @@ -373,30 +368,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X64-SSE42-LABEL: test_reduce_v16i8:  ; X64-SSE42:       ## %bb.0: -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq  ;  ; X64-AVX-LABEL: test_reduce_v16i8:  ; X64-AVX:       ## %bb.0: -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX-NEXT:    retq @@ -910,16 +900,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-SSE42-LABEL: test_reduce_v32i8:  ; X86-SSE42:       ## %bb.0:  ; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; 
X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl @@ -928,14 +915,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX1:       ## %bb.0:  ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -944,15 +929,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX2-LABEL: test_reduce_v32i8:  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -998,16 +981,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-SSE42-LABEL: test_reduce_v32i8:  ; X64-SSE42:       ## %bb.0:  ; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; 
X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -1016,14 +996,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX1:       ## %bb.0:  ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -1032,15 +1010,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX2-LABEL: test_reduce_v32i8:  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -1049,15 +1025,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v32i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; 
X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper @@ -1745,16 +1719,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-SSE42-NEXT:    pminsb %xmm3, %xmm1  ; X86-SSE42-NEXT:    pminsb %xmm2, %xmm0  ; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl @@ -1766,14 +1737,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2  ; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -1783,15 +1752,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; 
X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -1849,16 +1816,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-SSE42-NEXT:    pminsb %xmm3, %xmm1  ; X64-SSE42-NEXT:    pminsb %xmm2, %xmm0  ; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pminsb %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminsb %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -1870,14 +1834,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm3, %xmm2  ; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpminsb %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -1887,15 +1849,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminsb %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT:    vpminub %xmm2, 
%xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -1904,17 +1864,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v64i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpminsb %ymm1, %ymm0, %ymm0  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminsb %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-umax.ll b/test/CodeGen/X86/horizontal-reduce-umax.ll index 52e623b82718..204479976e90 100644 --- a/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -362,30 +362,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X86-SSE42-LABEL: test_reduce_v16i8:  ; X86-SSE42:       ## %bb.0: -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl  ;  ; X86-AVX-LABEL: test_reduce_v16i8:  ; X86-AVX:       ## %bb.0: -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX-NEXT:    vphminposuw 
%xmm0, %xmm0 +; X86-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX-NEXT:    retl @@ -408,30 +403,25 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X64-SSE42-LABEL: test_reduce_v16i8:  ; X64-SSE42:       ## %bb.0: -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq  ;  ; X64-AVX-LABEL: test_reduce_v16i8:  ; X64-AVX:       ## %bb.0: -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX-NEXT:    retq @@ -1031,16 +1021,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-SSE42-LABEL: test_reduce_v32i8:  ; X86-SSE42:       ## %bb.0:  ; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl @@ -1049,14 +1036,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX1:       ## %bb.0:  ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; 
X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -1065,15 +1050,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX2-LABEL: test_reduce_v32i8:  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -1099,16 +1082,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-SSE42-LABEL: test_reduce_v32i8:  ; X64-SSE42:       ## %bb.0:  ; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -1117,14 +1097,12 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX1:       ## %bb.0:  ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT:    vpminub 
%xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -1133,15 +1111,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX2-LABEL: test_reduce_v32i8:  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -1150,15 +1126,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v32i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper @@ -1992,16 +1966,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-SSE42-NEXT:    pmaxub %xmm3, %xmm1  ; X86-SSE42-NEXT:    pmaxub %xmm2, %xmm0  ; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 -; X86-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X86-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X86-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X86-SSE42-NEXT:    psrlw $8, %xmm2 +; X86-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X86-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X86-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al 
killed %eax  ; X86-SSE42-NEXT:    retl @@ -2013,14 +1984,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2  ; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -2030,15 +1999,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X86-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X86-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -2068,16 +2035,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-SSE42-NEXT:    pmaxub %xmm3, %xmm1  ; X64-SSE42-NEXT:    pmaxub %xmm2, %xmm0  ; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 -; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 -; X64-SSE42-NEXT:    pmaxub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pmaxub %xmm1, %xmm0 +; X64-SSE42-NEXT:    pcmpeqd %xmm1, %xmm1 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0 +; X64-SSE42-NEXT:    movdqa %xmm0, %xmm2 +; X64-SSE42-NEXT:    psrlw $8, %xmm2 +; X64-SSE42-NEXT:    pminub %xmm0, %xmm2 +; X64-SSE42-NEXT:    phminposuw %xmm2, %xmm0 +; X64-SSE42-NEXT:    pxor %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -2089,14 +2053,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm3, %xmm2  ; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpmaxub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; 
X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -2106,15 +2068,13 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX2-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -2123,17 +2083,15 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v64i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpmaxub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm2 +; X64-AVX512-NEXT:    vpminub %xmm2, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0 +; X64-AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/horizontal-reduce-umin.ll b/test/CodeGen/X86/horizontal-reduce-umin.ll index 505663656a3a..2a37d17365be 100644 --- a/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -352,30 +352,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X86-SSE42-LABEL: test_reduce_v16i8:  ; 
X86-SSE42:       ## %bb.0: -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 +; X86-SSE42-NEXT:    psrlw $8, %xmm1  ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl  ;  ; X86-AVX-LABEL: test_reduce_v16i8:  ; X86-AVX:       ## %bb.0: -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X86-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT:    vphminposuw %xmm0, %xmm0  ; X86-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX-NEXT:    retl @@ -398,30 +387,19 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) {  ;  ; X64-SSE42-LABEL: test_reduce_v16i8:  ; X64-SSE42:       ## %bb.0: -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 +; X64-SSE42-NEXT:    psrlw $8, %xmm1  ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq  ;  ; X64-AVX-LABEL: test_reduce_v16i8:  ; X64-AVX:       ## %bb.0: -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X64-AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX-NEXT:    retq @@ -1004,16 +982,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-SSE42-LABEL: test_reduce_v32i8:  ; X86-SSE42:       ## %bb.0:  ; X86-SSE42-NEXT:    pminub %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 +; X86-SSE42-NEXT:    psrlw $8, %xmm1  ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    
## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl @@ -1022,14 +994,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX1:       ## %bb.0:  ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -1038,15 +1005,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X86-AVX2-LABEL: test_reduce_v32i8:  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -1072,16 +1034,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-SSE42-LABEL: test_reduce_v32i8:  ; X64-SSE42:       ## %bb.0:  ; X64-SSE42-NEXT:    pminub %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 +; X64-SSE42-NEXT:    psrlw $8, %xmm1  ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -1090,14 +1046,9 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX1:       ## %bb.0:  ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1  ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -1106,15 +1057,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX2-LABEL: test_reduce_v32i8:  ; X64-AVX2:       ## %bb.0:  ; 
X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -1123,15 +1069,10 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v32i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper @@ -1942,16 +1883,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-SSE42-NEXT:    pminub %xmm3, %xmm1  ; X86-SSE42-NEXT:    pminub %xmm2, %xmm0  ; X86-SSE42-NEXT:    pminub %xmm1, %xmm0 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X86-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT:    psrld $16, %xmm1 +; X86-SSE42-NEXT:    psrlw $8, %xmm1  ; X86-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X86-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT:    psrlw $8, %xmm0 -; X86-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X86-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X86-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X86-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-SSE42-NEXT:    retl @@ -1963,14 +1898,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2  ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X86-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT:    vphminposuw %xmm0, %xmm0  ; X86-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX1-NEXT:    vzeroupper @@ -1980,15 +1910,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X86-AVX2:       ## %bb.0:  ; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, 
%ymm0  ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X86-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT:    vphminposuw %xmm0, %xmm0  ; X86-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X86-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X86-AVX2-NEXT:    vzeroupper @@ -2018,16 +1943,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-SSE42-NEXT:    pminub %xmm3, %xmm1  ; X64-SSE42-NEXT:    pminub %xmm2, %xmm0  ; X64-SSE42-NEXT:    pminub %xmm1, %xmm0 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0  ; X64-SSE42-NEXT:    movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT:    psrld $16, %xmm1 +; X64-SSE42-NEXT:    psrlw $8, %xmm1  ; X64-SSE42-NEXT:    pminub %xmm0, %xmm1 -; X64-SSE42-NEXT:    movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT:    psrlw $8, %xmm0 -; X64-SSE42-NEXT:    pminub %xmm1, %xmm0 +; X64-SSE42-NEXT:    phminposuw %xmm1, %xmm0  ; X64-SSE42-NEXT:    pextrb $0, %xmm0, %eax  ; X64-SSE42-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-SSE42-NEXT:    retq @@ -2039,14 +1958,9 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm3, %xmm2  ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpminub %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1  ; X64-AVX1-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX1-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX1-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX1-NEXT:    vzeroupper @@ -2056,15 +1970,10 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX2:       ## %bb.0:  ; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0  ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX2-NEXT:    vpminub %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX2-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX2-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX2-NEXT:    vzeroupper @@ -2073,17 +1982,12 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) {  ; X64-AVX512-LABEL: test_reduce_v64i8:  ; X64-AVX512:       ## %bb.0:  ; X64-AVX512-NEXT: 
   vextracti64x4 $1, %zmm0, %ymm1 -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpminub %ymm1, %ymm0, %ymm0  ; X64-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 -; X64-AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm1 -; X64-AVX512-NEXT:    vpminub %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT:    vpminub %xmm1, %xmm0, %xmm0 +; X64-AVX512-NEXT:    vphminposuw %xmm0, %xmm0  ; X64-AVX512-NEXT:    vpextrb $0, %xmm0, %eax  ; X64-AVX512-NEXT:    ## kill: def %al killed %al killed %eax  ; X64-AVX512-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll index 283d1f93dfb6..46a888f3b9b6 100644 --- a/test/CodeGen/X86/known-bits-vector.ll +++ b/test/CodeGen/X86/known-bits-vector.ll @@ -160,17 +160,19 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind {  define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {  ; X32-LABEL: knownbits_mask_or_shuffle_uitofp:  ; X32:       # %bb.0: -; X32-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT:    vorps {{\.LCPI.*}}, %xmm0, %xmm0 -; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X32-NEXT:    vpor {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; X32-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]  ; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0  ; X32-NEXT:    retl  ;  ; X64-LABEL: knownbits_mask_or_shuffle_uitofp:  ; X64:       # %bb.0: -; X64-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT:    vorps {{.*}}(%rip), %xmm0, %xmm0 -; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0 +; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; X64-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6] +; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]  ; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0  ; X64-NEXT:    retq    %1 = and <4 x i32> %a0, <i32 -1, i32 -1, i32 255, i32 4085> diff --git a/test/CodeGen/X86/machinesink-merge-debuginfo.ll b/test/CodeGen/X86/machinesink-merge-debuginfo.ll index d8fcea1872e8..f5023bbeb5f9 100644 --- a/test/CodeGen/X86/machinesink-merge-debuginfo.ll +++ b/test/CodeGen/X86/machinesink-merge-debuginfo.ll @@ -5,6 +5,21 @@ source_filename = "test-sink-debug.cpp"  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"  target triple = "x86_64-unknown-linux-gnu" +; double foo(double x, double y, bool c) { +;   double a = x / 3.0; +;   double b = y / 5.0; +;   double ret; +; +;   if (c) +;      ret = a + 1.0; +;   else +;      ret = b + 1.0; +; +;   ret = ret + 1.0; +; +;   return ret; +; } +  ; Function Attrs: nounwind readnone uwtable  define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !7 {    tail call void @llvm.dbg.value(metadata double %x, metadata !13, metadata !DIExpression()), !dbg !16 @@ -17,9 +32,10 @@ first:    %e = fadd double %a, 1.000000e+00    br label %final  second: -  %f = fadd double %b, 1.000000e+00, !dbg !17 -; CHECK:  
debug-location !17 -; CHECK:  debug-location !17 +; CHECK-NOT:  debug-location !17 +; CHECK:  debug-location !18 +; CHECK-NOT:  debug-location !17 +  %f = fadd double %b, 1.000000e+00, !dbg !18    br label %final  final:    %cond = phi double [%e, %first], [%f, %second] @@ -27,15 +43,39 @@ final:    ret double %d  } -; Function Attrs: nounwind readnone speculatable -declare void @llvm.dbg.value(metadata, metadata, metadata) #1 -attributes #1 = { nounwind readnone speculatable } + +; Function Attrs: nounwind readnone uwtable +define double @_Z4foo1ddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr !dbg !19 { +  tail call void @llvm.dbg.value(metadata double %x, metadata !21, metadata !DIExpression()), !dbg !24 +  tail call void @llvm.dbg.value(metadata double %y, metadata !22, metadata !DIExpression()), !dbg !24 +  tail call void @llvm.dbg.value(metadata i1 %c, metadata !23, metadata !DIExpression()), !dbg !24 +  %a = fdiv double %x, 3.000000e+00 +  %b = fdiv double %y, 5.000000e+00, !dbg !25 +  br i1 %c, label %first, label %second +first: +  %e = fadd double %a, 1.000000e+00 +  br label %final +second: +  %f = fadd double %b, 1.000000e+00, !dbg !25 +; CHECK:  debug-location !25 +; CHECK-NEXT:  debug-location !25 +  br label %final +final: +  %cond = phi double [%e, %first], [%f, %second] +  %d = fadd double %cond, 1.000000e+00 +  ret double %d +}  !llvm.dbg.cu = !{!0}  !llvm.module.flags = !{!3, !4, !5}  !llvm.ident = !{!6} +attributes #1 = { nounwind readnone speculatable } + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 +  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 6.0.0 (trunk 313291)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)  !1 = !DIFile(filename: "test-sink-debug.cpp", directory: "/tmp")  !2 = !{} @@ -54,3 +94,11 @@ attributes #1 = { nounwind readnone speculatable }  !15 = !DILocalVariable(name: "c", arg: 3, scope: !7, file: !1, line: 1, type: !11)  !16 = !DILocation(line: 1, column: 19, scope: !7)  !17 = !DILocation(line: 2, column: 26, scope: !7) +!18 = !DILocation(line: 3, column: 20, scope: !7) +!19 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ddb", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !20) +!20 = !{!21, !22, !23} +!21 = !DILocalVariable(name: "x", arg: 1, scope: !19, file: !1, line: 1, type: !10) +!22 = !DILocalVariable(name: "y", arg: 2, scope: !19, file: !1, line: 1, type: !10) +!23 = !DILocalVariable(name: "c", arg: 3, scope: !19, file: !1, line: 1, type: !11) +!24 = !DILocation(line: 1, column: 19, scope: !19) +!25 = !DILocation(line: 2, column: 26, scope: !19) diff --git a/test/CodeGen/X86/machinesink-null-debuginfo.ll b/test/CodeGen/X86/machinesink-null-debuginfo.ll index 454e0cd704ff..c0399b3cfa81 100644 --- a/test/CodeGen/X86/machinesink-null-debuginfo.ll +++ b/test/CodeGen/X86/machinesink-null-debuginfo.ll @@ -11,10 +11,10 @@ define double @_Z3fooddb(double %x, double %y, i1 zeroext %c) local_unnamed_addr    tail call void @llvm.dbg.value(metadata double %y, metadata !14, metadata !DIExpression()), !dbg !17    tail call void @llvm.dbg.value(metadata i1 %c, metadata !15, metadata !DIExpression()), !dbg !18    %a = fdiv double %x, 3.000000e+00 -  %b = fdiv double %y, 5.000000e+00, !dbg !21 +  %b = fdiv double %y, 5.000000e+00, !dbg !19    %cond = select i1 %c,  double %a, double %b 
-; CHECK-NOT: debug-location !21 -  ret double %cond, !dbg !22 +; CHECK-NOT: debug-location !19 +  ret double %cond, !dbg !20  }  ; Function Attrs: nounwind readnone speculatable @@ -45,5 +45,5 @@ attributes #1 = { nounwind readnone speculatable }  !16 = !DILocation(line: 1, column: 19, scope: !7)  !17 = !DILocation(line: 1, column: 29, scope: !7)  !18 = !DILocation(line: 1, column: 37, scope: !7) -!21 = !DILocation(line: 2, column: 26, scope: !7) -!22 = !DILocation(line: 2, column: 3, scope: !7) +!19 = !DILocation(line: 2, column: 26, scope: !7) +!20 = !DILocation(line: 2, column: 3, scope: !7) diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 1eb2631e26ef..d318dde34434 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -1500,7 +1500,8 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {  ; KNL_32-NEXT:    vpsllq $32, %xmm0, %xmm0  ; KNL_32-NEXT:    vpsraq $32, %zmm0, %zmm0  ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,1,0] +; KNL_32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; KNL_32-NEXT:    vmovdqa %xmm2, %xmm2  ; KNL_32-NEXT:    vpsllq $63, %zmm2, %zmm2  ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1  ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} @@ -1596,7 +1597,8 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {  ; KNL_32-NEXT:    vpsllq $32, %xmm1, %xmm1  ; KNL_32-NEXT:    vpsraq $32, %zmm1, %zmm1  ; KNL_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL_32-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,1,0] +; KNL_32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2 +; KNL_32-NEXT:    vmovdqa %xmm2, %xmm2  ; KNL_32-NEXT:    vpsllq $63, %zmm2, %zmm2  ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1  ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1} @@ -2606,56 +2608,32 @@ define <2 x float> @large_index(float* %base, <2 x i128> %ind, <2 x i1> %mask, <  define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {  ; KNL_64-LABEL: sext_i8_index:  ; KNL_64:       # %bb.0: -; KNL_64-NEXT:    vpmovsxbw %xmm0, %ymm0 -; KNL_64-NEXT:    vpmovsxwq %xmm0, %zmm1 -; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; KNL_64-NEXT:    vpmovsxwq %xmm0, %zmm0 +; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm1  ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1 -; KNL_64-NEXT:    kxnorw %k0, %k0, %k2 -; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2} -; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} -; KNL_64-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}  ; KNL_64-NEXT:    retq  ;  ; KNL_32-LABEL: sext_i8_index:  ; KNL_32:       # %bb.0:  ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT:    vpmovsxbw %xmm0, %ymm0 -; KNL_32-NEXT:    vpmovsxwq %xmm0, %zmm1 -; KNL_32-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; KNL_32-NEXT:    vpmovsxwq %xmm0, %zmm0 +; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm1  ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1 -; KNL_32-NEXT:    kxnorw %k0, %k0, %k2 -; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2} -; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} -; KNL_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}  ; KNL_32-NEXT:    retl  ;  ; SKX-LABEL: sext_i8_index:  ; SKX:       # %bb.0: -; SKX-NEXT:    vpmovsxbw %xmm0, %ymm0 -; SKX-NEXT:    vpmovsxwq %xmm0, %zmm1 -; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; SKX-NEXT:    vpmovsxwq %xmm0, %zmm0 +; SKX-NEXT:    vpmovsxbd %xmm0, 
%zmm1  ; SKX-NEXT:    kxnorw %k0, %k0, %k1 -; SKX-NEXT:    kxnorw %k0, %k0, %k2 -; SKX-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k2} -; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} -; SKX-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}  ; SKX-NEXT:    retq  ;  ; SKX_32-LABEL: sext_i8_index:  ; SKX_32:       # %bb.0:  ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT:    vpmovsxbw %xmm0, %ymm0 -; SKX_32-NEXT:    vpmovsxwq %xmm0, %zmm1 -; SKX_32-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; SKX_32-NEXT:    vpmovsxwq %xmm0, %zmm0 +; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm1  ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1 -; SKX_32-NEXT:    kxnorw %k0, %k0, %k2 -; SKX_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k2} -; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} -; SKX_32-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 +; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}  ; SKX_32-NEXT:    retl    %sext_ind = sext <16 x i8> %ind to <16 x i64> @@ -2669,40 +2647,42 @@ define <16 x float> @sext_i8_index(float* %base, <16 x i8> %ind) {  define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {  ; KNL_64-LABEL: sext_v8i8_index:  ; KNL_64:       # %bb.0: -; KNL_64-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; KNL_64-NEXT:    vpsllq $56, %zmm0, %zmm0 -; KNL_64-NEXT:    vpsraq $56, %zmm0, %zmm1 +; KNL_64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero  ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1 +; KNL_64-NEXT:    vpslld $24, %ymm0, %ymm0 +; KNL_64-NEXT:    vpsrad $24, %ymm0, %ymm0 +; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1  ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}  ; KNL_64-NEXT:    retq  ;  ; KNL_32-LABEL: sext_v8i8_index:  ; KNL_32:       # %bb.0: -; KNL_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; KNL_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero  ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax -; KNL_32-NEXT:    vpsllq $56, %zmm0, %zmm0 -; KNL_32-NEXT:    vpsraq $56, %zmm0, %zmm1 +; KNL_32-NEXT:    vpslld $24, %ymm0, %ymm0 +; KNL_32-NEXT:    vpsrad $24, %ymm0, %ymm0  ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1 +; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1  ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}  ; KNL_32-NEXT:    retl  ;  ; SKX-LABEL: sext_v8i8_index:  ; SKX:       # %bb.0: -; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; SKX-NEXT:    vpsllq $56, %zmm0, %zmm0 -; SKX-NEXT:    vpsraq $56, %zmm0, %zmm1 +; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero  ; SKX-NEXT:    kxnorw %k0, %k0, %k1 -; SKX-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1} +; SKX-NEXT:    vpslld $24, %ymm0, %ymm0 +; SKX-NEXT:    vpsrad $24, %ymm0, %ymm1 +; SKX-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}  ; SKX-NEXT:    retq  
;  ; SKX_32-LABEL: sext_v8i8_index:  ; SKX_32:       # %bb.0: -; SKX_32-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; SKX_32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero  ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax -; SKX_32-NEXT:    vpsllq $56, %zmm0, %zmm0 -; SKX_32-NEXT:    vpsraq $56, %zmm0, %zmm1 +; SKX_32-NEXT:    vpslld $24, %ymm0, %ymm0 +; SKX_32-NEXT:    vpsrad $24, %ymm0, %ymm1  ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1 -; SKX_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1} +; SKX_32-NEXT:    vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}  ; SKX_32-NEXT:    retl    %sext_ind = sext <8 x i8> %ind to <8 x i64> @@ -2764,3 +2744,48 @@ define void @test_scatter_2i32_index(<2 x double> %a1, double* %base, <2 x i32>  }  declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double>, <2 x double*>, i32, <2 x i1>) +define <16 x float> @zext_index(float* %base, <16 x i32> %ind) { +; KNL_64-LABEL: zext_index: +; KNL_64:       # %bb.0: +; KNL_64-NEXT:    vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 +; KNL_64-NEXT:    kxnorw %k0, %k0, %k1 +; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; KNL_64-NEXT:    retq +; +; KNL_32-LABEL: zext_index: +; KNL_32:       # %bb.0: +; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; KNL_32-NEXT:    vpandd {{\.LCPI.*}}{1to16}, %zmm0, %zmm1 +; KNL_32-NEXT:    kxnorw %k0, %k0, %k1 +; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; KNL_32-NEXT:    retl +; +; SKX_SMALL-LABEL: zext_index: +; SKX_SMALL:       # %bb.0: +; SKX_SMALL-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm1 +; SKX_SMALL-NEXT:    kxnorw %k0, %k0, %k1 +; SKX_SMALL-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; SKX_SMALL-NEXT:    retq +; +; SKX_LARGE-LABEL: zext_index: +; SKX_LARGE:       # %bb.0: +; SKX_LARGE-NEXT:    movabsq ${{\.LCPI.*}}, %rax +; SKX_LARGE-NEXT:    vandps (%rax){1to16}, %zmm0, %zmm1 +; SKX_LARGE-NEXT:    kxnorw %k0, %k0, %k1 +; SKX_LARGE-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} +; SKX_LARGE-NEXT:    retq +; +; SKX_32-LABEL: zext_index: +; SKX_32:       # %bb.0: +; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax +; SKX_32-NEXT:    vandps {{\.LCPI.*}}{1to16}, %zmm0, %zmm1 +; SKX_32-NEXT:    kxnorw %k0, %k0, %k1 +; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} +; SKX_32-NEXT:    retl +  %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> +  %sext_ind = zext <16 x i32> %ind_masked to <16 x i64> +  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind + +  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) +  ret <16 x float>%res +} diff --git a/test/CodeGen/X86/popcnt.ll b/test/CodeGen/X86/popcnt.ll index d7622c8d0cab..f7b9ea9b8b2e 100644 --- a/test/CodeGen/X86/popcnt.ll +++ b/test/CodeGen/X86/popcnt.ll @@ -78,7 +78,7 @@ define i16 @cnt16(i16 %x) nounwind readnone {  ; X32-NEXT:    movl %ecx, %eax  ; X32-NEXT:    shll $8, %eax  ; X32-NEXT:    addl %ecx, %eax -; X32-NEXT:    movzbl %ah, %eax +; X32-NEXT:    movzbl %ah, %eax # NOREX  ; X32-NEXT:    # 
kill: def %ax killed %ax killed %eax
 ; X32-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/prefetch.ll b/test/CodeGen/X86/prefetch.ll
index 17a9ac994a79..839948174a43 100644
--- a/test/CodeGen/X86/prefetch.ll
+++ b/test/CodeGen/X86/prefetch.ll
@@ -1,27 +1,101 @@
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i686-- -mattr=+sse -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=SLM
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHW
-; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=NOPRFCHW
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+avx | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+prfchw | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=slm | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 | FileCheck %s -check-prefix=PRFCHWSSE
+; RUN: llc < %s -mtriple=i686-- -mcpu=btver2 -mattr=-prfchw | FileCheck %s -check-prefix=SSE
+; RUN: llc < %s -mtriple=i686-- -mattr=+sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=-sse,+3dnow,+prefetchwt1 | FileCheck %s -check-prefix=PREFETCHWT1
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow | FileCheck %s -check-prefix=3DNOW
+; RUN: llc < %s -mtriple=i686-- -mattr=+3dnow,+prfchw | FileCheck %s -check-prefix=PRFCHW3DNOW
+
+; Rules:
+; 3dnow by itself gets you just the single prefetch instruction, with no hints.
+; sse provides prefetcht0/1/2/nta.
+; supporting prefetchw but not 3dnow implicitly provides prefetcht0/1/2/nta regardless of the sse setting, as we need something to fall back to for the non-write hints.
+; supporting prefetchwt1 implies prefetcht0/1/2/nta and prefetchw regardless of other settings.
+; This allows hint levels for non-write prefetches and gives us an instruction for write+T0.
+; The 3dnow prefetch instruction will only get used if no other prefetch instructions are enabled.
 ; rdar://10538297
 define void @t(i8* %ptr) nounwind  {
+; SSE-LABEL: t:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-NEXT:    prefetcht2 (%eax)
+; SSE-NEXT:    prefetcht1 (%eax)
+; SSE-NEXT:    prefetcht0 (%eax)
+; SSE-NEXT:    prefetchnta (%eax)
+; SSE-NEXT:    prefetcht2 (%eax)
+; SSE-NEXT:    prefetcht1 (%eax)
+; SSE-NEXT:    prefetcht0 (%eax)
+; SSE-NEXT:    prefetchnta (%eax)
+; SSE-NEXT:    retl
+;
+; PRFCHWSSE-LABEL: t:
+; PRFCHWSSE:       # %bb.0: # %entry
+; PRFCHWSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; PRFCHWSSE-NEXT:    prefetcht2 (%eax)
+; PRFCHWSSE-NEXT:    prefetcht1 (%eax)
+; PRFCHWSSE-NEXT:    prefetcht0 (%eax)
+; PRFCHWSSE-NEXT:    prefetchnta (%eax)
+; PRFCHWSSE-NEXT:    prefetchw (%eax)
+; PRFCHWSSE-NEXT:    prefetchw (%eax)
+; PRFCHWSSE-NEXT:    prefetchw (%eax)
+; PRFCHWSSE-NEXT:    prefetchw (%eax)
+; PRFCHWSSE-NEXT:    retl
+;
+; PREFETCHWT1-LABEL: t:
+; PREFETCHWT1:       # %bb.0: # %entry
+; PREFETCHWT1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; PREFETCHWT1-NEXT:    prefetcht2 (%eax)
+; PREFETCHWT1-NEXT:    prefetcht1 (%eax)
+; PREFETCHWT1-NEXT:    prefetcht0 (%eax)
+; PREFETCHWT1-NEXT:    prefetchnta (%eax)
+; PREFETCHWT1-NEXT:    prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT:    prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT:    prefetchw (%eax)
+; PREFETCHWT1-NEXT:    prefetchwt1 (%eax)
+; PREFETCHWT1-NEXT:    retl
+;
+; 3DNOW-LABEL: t:
+; 3DNOW:       # %bb.0: # %entry
+; 3DNOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    prefetch (%eax)
+; 3DNOW-NEXT:    retl
+;
+; PRFCHW3DNOW-LABEL: t:
+; PRFCHW3DNOW:       # %bb.0: # %entry
+; PRFCHW3DNOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; PRFCHW3DNOW-NEXT:    prefetch (%eax)
+; PRFCHW3DNOW-NEXT:    prefetch (%eax)
+; PRFCHW3DNOW-NEXT:    prefetch (%eax)
+; PRFCHW3DNOW-NEXT:    prefetch (%eax)
+; PRFCHW3DNOW-NEXT:    prefetchw (%eax)
+; PRFCHW3DNOW-NEXT:    prefetchw (%eax)
+; PRFCHW3DNOW-NEXT:    prefetchw (%eax)
+; PRFCHW3DNOW-NEXT:    prefetchw (%eax)
+; PRFCHW3DNOW-NEXT:    retl
 entry:
-; CHECK: prefetcht2
-; CHECK: prefetcht1
-; CHECK: prefetcht0
-; CHECK: prefetchnta
-; PRFCHW: prefetchw
-; NOPRFCHW-NOT: prefetchw
-; SLM: prefetchw
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 1, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 2, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 0, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 1, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 2, i32 1 )
 	tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3, i32 1 )
+	tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 0, i32 1 )
 	ret void
 }
-declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind 
+declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
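The rules above describe how llc selects a prefetch instruction from the rw and locality operands of @llvm.prefetch. As a rough sketch only (not part of this patch), the eight intrinsic calls in the test correspond to C's __builtin_prefetch(addr, rw, locality), which clang lowers to @llvm.prefetch with a trailing cache-type operand of 1 (data cache); the function name below is hypothetical, and the instruction noted on each line is the one the rules pick when the corresponding feature is available:

// Hypothetical C equivalent of the test body (illustration only).
// clang lowers __builtin_prefetch(addr, rw, locality) to
// @llvm.prefetch(addr, rw, locality, i32 1 /= data cache =/).
void prefetch_hints(const char *p) {
  __builtin_prefetch(p, 0, 1);  // read,  T2  -> prefetcht2 (with sse)
  __builtin_prefetch(p, 0, 2);  // read,  T1  -> prefetcht1
  __builtin_prefetch(p, 0, 3);  // read,  T0  -> prefetcht0
  __builtin_prefetch(p, 0, 0);  // read,  NTA -> prefetchnta
  __builtin_prefetch(p, 1, 3);  // write, T0  -> prefetchw when prfchw is available
  __builtin_prefetch(p, 1, 0);  // write, NTA -> prefetchwt1 when available, else prefetchw
}

diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
index 0f1f818e250d..2df115e2f9e4 100644
--- a/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
+++ b/test/CodeGen/X86/shuffle-strided-with-offset-128.ll
@@ -2,7 +2,8 @@
 ; RUN: llc < %s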
-mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -363,12 +364,26 @@ define void @shuffle_v8i16_to_v2i16_1(<8 x i16>* %L, <2 x i16>* %S) nounwind {  ; SSE-NEXT:    movd %xmm0, (%rsi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_1: -; AVX:       # %bb.0: -; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX-NEXT:    vmovd %xmm0, (%rsi) -; AVX-NEXT:    retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX1-NEXT:    vmovd %xmm0, (%rsi) +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:  ; AVX512F:       # %bb.0: @@ -409,12 +424,26 @@ define void @shuffle_v8i16_to_v2i16_2(<8 x i16>* %L, <2 x i16>* %S) nounwind {  ; SSE-NEXT:    movd %xmm0, (%rsi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_2: -; AVX:       # %bb.0: -; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX-NEXT:    vmovd %xmm0, (%rsi) -; AVX-NEXT:    retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX1-NEXT:    vmovd %xmm0, (%rsi) +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    
vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:  ; AVX512F:       # %bb.0: @@ -455,12 +484,26 @@ define void @shuffle_v8i16_to_v2i16_3(<8 x i16>* %L, <2 x i16>* %S) nounwind {  ; SSE-NEXT:    movd %xmm0, (%rsi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: shuffle_v8i16_to_v2i16_3: -; AVX:       # %bb.0: -; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; AVX-NEXT:    vmovd %xmm0, (%rsi) -; AVX-NEXT:    retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX1-NEXT:    vmovd %xmm0, (%rsi) +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:  ; AVX512F:       # %bb.0: diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 7cef269ebc2b..081c962ab94a 100644 --- a/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -1,6 +1,7 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -362,18 +363,30 @@ define void @shuffle_v16i16_to_v4i16_1(<16 x i16>* %L, <4 x i16>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_1: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT:    vmovq %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, 
%xmm1 +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:  ; AVX512F:       # %bb.0: @@ -446,18 +459,30 @@ define void @shuffle_v16i16_to_v4i16_2(<16 x i16>* %L, <4 x i16>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_2: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] -; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT:    vmovq %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:  ; AVX512F:       # %bb.0: @@ -522,18 +547,30 @@ define void @shuffle_v16i16_to_v4i16_3(<16 x i16>* %L, <4 x i16>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16_3: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = 
xmm0[3,1,2,3,4,5,6,7] -; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT:    vmovq %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:  ; AVX512F:       # %bb.0: @@ -633,11 +670,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512VL:       # %bb.0:  ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]  ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpsrld $16, %xmm1, %xmm1  ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT:    vpsrld $16, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512VL-NEXT:    vzeroupper @@ -659,11 +694,9 @@ define void @shuffle_v32i8_to_v4i8_1(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,255,255,9,9,255,255,9,9,255,255,11,11,255,255]  ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT:    vpsrld $16, %xmm1, %xmm1  ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm0  ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper @@ -795,11 +828,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512VL:       # %bb.0:  ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]  ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]  ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]  ; AVX512VL-NEXT:    
vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512VL-NEXT:    vzeroupper @@ -821,11 +852,9 @@ define void @shuffle_v32i8_to_v4i8_3(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,3,11,11,11,11,3,3,8,8,9,9,10,10,11,11]  ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper @@ -949,14 +978,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512VL:       # %bb.0:  ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] +; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512VL-NEXT:    vzeroupper @@ -978,14 +1002,9 @@ define void @shuffle_v32i8_to_v4i8_5(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [5,5,13,13,13,13,5,5,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0  ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper diff --git a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 7f3431fabedc..a5d0a7fa401b 100644 --- a/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -99,10 +99,9 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa64 
(%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]  ; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512BWVL-NEXT:    vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper  ; AVX512BWVL-NEXT:    retq    %vec = load <32 x i16>, <32 x i16>* %L @@ -673,17 +672,14 @@ define void @shuffle_v64i8_to_v8i8_1(<64 x i8>* %L, <8 x i8>* %S) nounwind {  ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,0,1,1,1,1,9,9,8,8,9,9,10,10,11,11]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,9,9,8,8,9,9,8,8,9,9,10,10,11,11]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]  ; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]  ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi) @@ -831,17 +827,14 @@ define void @shuffle_v64i8_to_v8i8_3(<64 x i8>* %L, <8 x i8>* %S) nounwind {  ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [10,10,11,11,3,3,11,11,8,8,9,9,10,10,11,11]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,3,11,11,2,2,3,3,8,8,9,9,10,10,11,11]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]  ; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]  ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi) @@ -989,24 +982,14 @@ define void @shuffle_v64i8_to_v8i8_5(<64 x i8>* %L, <8 x i8>* %S) nounwind {  ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BWVL-NEXT:    
vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [12,12,13,13,5,5,13,13,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,5,13,13,4,4,5,5,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]  ; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]  ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi) @@ -1154,17 +1137,14 @@ define void @shuffle_v64i8_to_v8i8_7(<64 x i8>* %L, <8 x i8>* %S) nounwind {  ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,6,6,7,7,4,4,5,5,6,6,7,7] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [14,14,15,15,7,7,15,15,4,4,5,5,6,6,7,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm2 +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,7,15,15,6,6,7,7,4,4,5,5,6,6,7,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]  ; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]  ; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]  ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi) diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/test/CodeGen/X86/shuffle-vs-trunc-128.ll index 1bfe37b1497e..f2cd81d3d937 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -2,7 +2,8 @@  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42  ; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -436,12 +437,26 @@ define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {  ; SSE-NEXT:    movd %xmm0, (%rsi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: shuffle_v8i16_to_v2i16: -; AVX:       # %bb.0: -; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT:    vmovd %xmm0, (%rsi) -; AVX-NEXT:    retq +; AVX1-LABEL: shuffle_v8i16_to_v2i16: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT:    vmovd %xmm0, (%rsi) +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v8i16_to_v2i16:  ; AVX512F:       # %bb.0: @@ -482,12 +497,26 @@ define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {  ; SSE-NEXT:    movd %xmm0, (%rsi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: trunc_v2i64_to_v2i16: -; AVX:       # %bb.0: -; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT:    vmovd %xmm0, (%rsi) -; AVX-NEXT:    retq +; AVX1-LABEL: trunc_v2i64_to_v2i16: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT:    vmovd %xmm0, (%rsi) +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: trunc_v2i64_to_v2i16:  ; AVX512F:       # %bb.0: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 59a8aa47246c..5b46c73a3f65 100644 --- 
a/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -1,6 +1,7 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW @@ -139,59 +140,17 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {  ; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ; -; AVX512F-LABEL: shuffle_v16i16_to_v8i16: -; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    retq -; -; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BW:       # %bb.0: -; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT:    vzeroupper -; AVX512BW-NEXT:    retq -; -; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BWVL:       # %bb.0: -; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT:    vpshufd 
{{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT:    vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT:    vzeroupper -; AVX512BWVL-NEXT:    retq +; AVX512-LABEL: shuffle_v16i16_to_v8i16: +; AVX512:       # %bb.0: +; AVX512-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT:    vmovdqa %xmm0, (%rsi) +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    retq    %vec = load <16 x i16>, <16 x i16>* %L    %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>    store <8 x i16> %strided.vec, <8 x i16>* %S @@ -290,13 +249,21 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: trunc_v4i64_to_v4i32: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT:    vmovaps %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT:    vmovaps %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT:    vpermps (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT:    vmovaps %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: trunc_v4i64_to_v4i32:  ; AVX512F:       # %bb.0: @@ -399,12 +366,9 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0  ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0  ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper @@ -490,18 +454,30 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_to_v4i16: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovdqa (%rdi), %ymm0 -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT:    vpunpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT:    vmovq %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: shuffle_v16i16_to_v4i16:  ; AVX512F:       # %bb.0: @@ -563,14 +539,23 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: trunc_v4i64_to_v4i16: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT:    vmovq %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: trunc_v4i64_to_v4i16:  ; AVX512F:       # %bb.0: @@ -693,14 +678,23 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {  ; AVX1-NEXT:    vzeroupper  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: trunc_v4i64_to_v4i8: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] -; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-NEXT:    vmovd %xmm0, (%rsi) -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i8: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = mem[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-SLOW-NEXT:    vzeroupper +; 
AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: trunc_v4i64_to_v4i8: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT:    vpermd (%rdi), %ymm0, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi) +; AVX2-FAST-NEXT:    vzeroupper +; AVX2-FAST-NEXT:    retq  ;  ; AVX512F-LABEL: trunc_v4i64_to_v4i8:  ; AVX512F:       # %bb.0: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 3fa148405f6b..4577cd8f8b44 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -148,10 +148,9 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {  ; AVX512BWVL:       # %bb.0:  ; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0  ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30] +; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]  ; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] -; AVX512BWVL-NEXT:    vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT:    vmovdqa %ymm2, (%rsi)  ; AVX512BWVL-NEXT:    vzeroupper  ; AVX512BWVL-NEXT:    retq    %vec = load <32 x i16>, <32 x i16>* %L diff --git a/test/CodeGen/X86/var-permute-256.ll b/test/CodeGen/X86/var-permute-256.ll index 82a790298f23..3baab2476d40 100644 --- a/test/CodeGen/X86/var-permute-256.ll +++ b/test/CodeGen/X86/var-permute-256.ll @@ -142,14 +142,13 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {  ; AVX1-NEXT:    andl $7, %r10d  ; AVX1-NEXT:    andl $28, %edi  ; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT:    movq %rsp, %rax -; AVX1-NEXT:    vpinsrd $1, (%rdx,%rax), %xmm0, %xmm0 +; AVX1-NEXT:    vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0  ; AVX1-NEXT:    vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0 -; AVX1-NEXT:    vpinsrd $3, (%rdi,%rax), %xmm0, %xmm0 +; AVX1-NEXT:    vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0  ; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT:    vpinsrd $1, (%rsi,%rax), %xmm1, %xmm1 +; AVX1-NEXT:    vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1  ; AVX1-NEXT:    vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1 -; AVX1-NEXT:    vpinsrd $3, (%rcx,%rax), %xmm1, %xmm1 +; AVX1-NEXT:    vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0  ; AVX1-NEXT:    movq %rbp, %rsp  ; AVX1-NEXT:    popq %rbp @@ -505,118 +504,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {  ; AVX1-NEXT:    vpextrb $0, %xmm2, %eax  ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movq %rsp, %rcx -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vmovd %eax, %xmm0  ; AVX1-NEXT:    vpextrb $1, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $2, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $3, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), 
%eax  ; AVX1-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $4, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $5, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $6, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $7, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $8, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $9, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $10, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $11, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $12, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $13, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $14, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $15, %xmm2, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0  ; AVX1-NEXT:    vpextrb $0, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vmovd %eax, %xmm2  ; AVX1-NEXT:    vpextrb $1, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $2, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $3, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $4, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $5, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $5, 
(%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $6, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $7, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $8, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $9, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $10, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $11, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $12, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $13, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $14, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX1-NEXT:    vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2  ; AVX1-NEXT:    vpextrb $15, %xmm1, %eax  ; AVX1-NEXT:    andl $31, %eax -; AVX1-NEXT:    movzbl (%rax,%rcx), %eax +; AVX1-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX1-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0  ; AVX1-NEXT:    movq %rbp, %rsp @@ -633,118 +631,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {  ; AVX2-NEXT:    vpextrb $0, %xmm2, %eax  ; AVX2-NEXT:    vmovaps %ymm0, (%rsp)  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movq %rsp, %rcx -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vmovd %eax, %xmm0  ; AVX2-NEXT:    vpextrb $1, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $2, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $3, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $4, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $5, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $6, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; 
AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $7, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $8, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $9, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $10, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $11, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $12, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $13, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $14, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $15, %xmm2, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0  ; AVX2-NEXT:    vpextrb $0, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vmovd %eax, %xmm2  ; AVX2-NEXT:    vpextrb $1, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $2, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $3, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $4, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $5, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $6, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $7, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $8, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; 
AVX2-NEXT:    vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $9, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $9, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $10, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $11, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $12, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $13, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $14, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX2-NEXT:    vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2  ; AVX2-NEXT:    vpextrb $15, %xmm1, %eax  ; AVX2-NEXT:    andl $31, %eax -; AVX2-NEXT:    movzbl (%rax,%rcx), %eax +; AVX2-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1  ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0  ; AVX2-NEXT:    movq %rbp, %rsp @@ -761,118 +758,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {  ; AVX512F-NEXT:    vpextrb $0, %xmm2, %eax  ; AVX512F-NEXT:    vmovaps %ymm0, (%rsp)  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movq %rsp, %rcx -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vmovd %eax, %xmm0  ; AVX512F-NEXT:    vpextrb $1, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $2, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $3, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $4, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $5, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $6, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $7, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $8, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), 
%eax  ; AVX512F-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $9, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $10, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $11, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $12, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $13, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $14, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $15, %xmm2, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0  ; AVX512F-NEXT:    vpextrb $0, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vmovd %eax, %xmm2  ; AVX512F-NEXT:    vpextrb $1, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $2, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $3, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $4, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $5, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $6, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $7, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $8, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $9, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $9, 
(%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $10, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $11, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $12, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $13, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $14, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512F-NEXT:    vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2  ; AVX512F-NEXT:    vpextrb $15, %xmm1, %eax  ; AVX512F-NEXT:    andl $31, %eax -; AVX512F-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512F-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1  ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0  ; AVX512F-NEXT:    movq %rbp, %rsp @@ -889,118 +885,117 @@ define <32 x i8> @var_shuffle_v32i8(<32 x i8> %v, <32 x i8> %indices) nounwind {  ; AVX512VL-NEXT:    vpextrb $0, %xmm2, %eax  ; AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movq %rsp, %rcx -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vmovd %eax, %xmm0  ; AVX512VL-NEXT:    vpextrb $1, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $2, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $3, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $4, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $5, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $6, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $7, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $8, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    
vpextrb $9, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $10, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $11, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $12, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $13, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $14, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $15, %xmm2, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0  ; AVX512VL-NEXT:    vpextrb $0, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vmovd %eax, %xmm2  ; AVX512VL-NEXT:    vpextrb $1, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $1, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $2, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $2, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $3, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $3, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $4, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $4, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $5, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $5, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $6, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $6, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $7, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $7, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $8, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $8, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $9, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $9, 
(%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $10, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $10, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $11, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $11, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $12, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $12, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $13, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $13, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $14, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm2, %xmm2 +; AVX512VL-NEXT:    vpinsrb $14, (%rsp,%rax), %xmm2, %xmm2  ; AVX512VL-NEXT:    vpextrb $15, %xmm1, %eax  ; AVX512VL-NEXT:    andl $31, %eax -; AVX512VL-NEXT:    movzbl (%rax,%rcx), %eax +; AVX512VL-NEXT:    movzbl (%rsp,%rax), %eax  ; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1  ; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0  ; AVX512VL-NEXT:    movq %rbp, %rsp @@ -1240,7 +1235,6 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi  ; AVX1-NEXT:    andl $7, %r10d  ; AVX1-NEXT:    andl $28, %edi  ; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT:    movq %rsp, %rax  ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]  ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]  ; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] diff --git a/test/CodeGen/X86/var-permute-512.ll b/test/CodeGen/X86/var-permute-512.ll index a5aa73cdf1a2..3f9f96b008c5 100644 --- a/test/CodeGen/X86/var-permute-512.ll +++ b/test/CodeGen/X86/var-permute-512.ll @@ -511,265 +511,201 @@ define <64 x i8> @var_shuffle_v64i8(<64 x i8> %v, <64 x i8> %indices) nounwind {  ; NOBW-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)  ; NOBW-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)  ; NOBW-NEXT:    vmovaps %ymm0, (%rsp) -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    movzbl (%rax,%rcx), %eax -; NOBW-NEXT:    vpextrb $1, %xmm4, %ecx -; NOBW-NEXT:    andl $63, %ecx +; NOBW-NEXT:    movzbl 3008(%rsp,%rax), %eax  ; NOBW-NEXT:    vmovd %eax, %xmm0 -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT:    vpinsrb $1, (%rcx,%rax), %xmm0, %xmm0 +; NOBW-NEXT:    vpextrb $1, %xmm4, %eax +; NOBW-NEXT:    andl $63, %eax +; NOBW-NEXT:    vpinsrb $1, 2944(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $2, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $2, 2880(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $3, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $3, 2816(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $4, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $4, 2752(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $5, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), 
%rcx -; NOBW-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $5, 2688(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $6, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $6, 2624(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $7, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $7, 2560(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $8, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $8, 2496(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $9, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $9, 2432(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $10, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $10, 2368(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $11, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $11, 2304(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $12, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $12, 2240(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $13, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $13, 2176(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $14, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $14, 2112(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $15, %xmm4, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0 +; NOBW-NEXT:    vpinsrb $15, 2048(%rsp,%rax), %xmm0, %xmm0  ; NOBW-NEXT:    vpextrb $0, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    movzbl (%rax,%rcx), %eax -; NOBW-NEXT:    vpextrb $1, %xmm2, %ecx -; NOBW-NEXT:    andl $63, %ecx +; NOBW-NEXT:    movzbl 4032(%rsp,%rax), %eax  ; NOBW-NEXT:    vmovd %eax, %xmm1 -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT:    vpinsrb $1, (%rcx,%rax), %xmm1, %xmm1 +; NOBW-NEXT:    vpextrb $1, %xmm2, %eax +; NOBW-NEXT:    andl $63, %eax +; NOBW-NEXT:    vpinsrb $1, 3968(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $2, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $2, 3904(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $3, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $3, 3840(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $4, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:  
  leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $4, 3776(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $5, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $5, 3712(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $6, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $6, 3648(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $7, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $7, 3584(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $8, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $8, 3520(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $9, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $9, 3456(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $10, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $10, 3392(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $11, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $11, 3328(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $12, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $12, 3264(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $13, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $13, 3200(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $14, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $14, 3136(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $15, %xmm2, %eax  ; NOBW-NEXT:    vextracti128 $1, %ymm3, %xmm2  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm1, %xmm1 +; NOBW-NEXT:    vpinsrb $15, 3072(%rsp,%rax), %xmm1, %xmm1  ; NOBW-NEXT:    vpextrb $0, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    movzbl (%rax,%rcx), %eax -; NOBW-NEXT:    vpextrb $1, %xmm2, %ecx -; NOBW-NEXT:    andl $63, %ecx +; NOBW-NEXT:    movzbl 960(%rsp,%rax), %eax  ; NOBW-NEXT:    vmovd %eax, %xmm4 -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT:    vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4 +; NOBW-NEXT:    vpextrb $1, %xmm2, %eax +; NOBW-NEXT:    andl $63, %eax +; NOBW-NEXT:    vpinsrb $1, 896(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $2, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $2, 832(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    
vpextrb $3, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $3, 768(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $4, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $4, 704(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $5, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $5, 640(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $6, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $6, 576(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $7, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $7, 512(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $8, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $8, 448(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $9, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $9, 384(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $10, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $10, 320(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $11, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $11, 256(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $12, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $12, 192(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $13, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $13, 128(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $14, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $14, 64(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $15, %xmm2, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    movq %rsp, %rcx -; NOBW-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm4, %xmm2 +; NOBW-NEXT:    vpinsrb $15, (%rsp,%rax), %xmm4, %xmm2  ; NOBW-NEXT:    vpextrb $0, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    movzbl (%rax,%rcx), %eax -; NOBW-NEXT:    vpextrb $1, %xmm3, %ecx -; NOBW-NEXT:    andl $63, %ecx +; NOBW-NEXT:    movzbl 1984(%rsp,%rax), %eax  ; NOBW-NEXT:    vmovd %eax, %xmm4 -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rax -; NOBW-NEXT:    vpinsrb $1, (%rcx,%rax), %xmm4, %xmm4 +; NOBW-NEXT:    vpextrb $1, %xmm3, %eax +; NOBW-NEXT:    andl $63, %eax +; NOBW-NEXT:    vpinsrb $1, 1920(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    
vpextrb $2, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $2, 1856(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $3, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $3, 1792(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $4, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $4, 1728(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $5, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $5, 1664(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $6, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $6, 1600(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $7, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $7, 1536(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $8, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $8, 1472(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $9, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $9, 1408(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $10, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $10, 1344(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $11, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $11, 1280(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $12, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $12, 1216(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $13, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $13, 1152(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $14, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm4, %xmm4 +; NOBW-NEXT:    vpinsrb $14, 1088(%rsp,%rax), %xmm4, %xmm4  ; NOBW-NEXT:    vpextrb $15, %xmm3, %eax  ; NOBW-NEXT:    andl $63, %eax -; NOBW-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx -; NOBW-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm4, %xmm3 +; NOBW-NEXT:    vpinsrb $15, 1024(%rsp,%rax), %xmm4, %xmm3  ; NOBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0  ; NOBW-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm1  ; NOBW-NEXT:    movq %rbp, %rsp diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 12530adf15cb..2178eb70cdec 100644 --- 
a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -2384,11 +2384,10 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {  ; AVX512F-LABEL: test_cmp_v32f32:  ; AVX512F:       # %bb.0:  ; AVX512F-NEXT:    vcmpltps %zmm0, %zmm2, %k1 -; AVX512F-NEXT:    movl {{.*}}(%rip), %eax -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0  ; AVX512F-NEXT:    vcmpltps %zmm1, %zmm3, %k1 -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -2399,12 +2398,11 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {  ;  ; AVX512DQ-LABEL: test_cmp_v32f32:  ; AVX512DQ:       # %bb.0: -; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k1 -; AVX512DQ-NEXT:    movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0  ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm3, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm3, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1  ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -2893,11 +2891,10 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {  ; AVX512F-LABEL: test_cmp_v32i32:  ; AVX512F:       # %bb.0:  ; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm0, %k1 -; AVX512F-NEXT:    movl {{.*}}(%rip), %eax -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0  ; AVX512F-NEXT:    vpcmpgtd %zmm3, %zmm1, %k1 -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -2908,12 +2905,11 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {  ;  ; AVX512DQ-LABEL: test_cmp_v32i32:  ; AVX512DQ:       # %bb.0: -; AVX512DQ-NEXT:    vpcmpgtd %zmm2, %zmm0, %k1 -; AVX512DQ-NEXT:    movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0  ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT:    vpcmpgtd %zmm3, %zmm1, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT:    vpcmpgtd %zmm3, %zmm1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1  ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -5965,13 +5961,12 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind  ; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm4, %k0  ; AVX512F-NEXT:    vcmpltpd %zmm1, %zmm5, %k1  ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT:    movl {{.*}}(%rip), %eax -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; AVX512F-NEXT:    
vpmovdb %zmm0, %xmm0  ; AVX512F-NEXT:    vcmpltpd %zmm2, %zmm6, %k0  ; AVX512F-NEXT:    vcmpltpd %zmm3, %zmm7, %k1  ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -5984,14 +5979,13 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind  ; AVX512DQ:       # %bb.0:  ; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm4, %k0  ; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm5, %k1 -; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT:    movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0  ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0  ; AVX512DQ-NEXT:    vcmpltpd %zmm2, %zmm6, %k0  ; AVX512DQ-NEXT:    vcmpltpd %zmm3, %zmm7, %k1 -; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1  ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -6588,13 +6582,12 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {  ; AVX512F-NEXT:    vpcmpgtq %zmm4, %zmm0, %k0  ; AVX512F-NEXT:    vpcmpgtq %zmm5, %zmm1, %k1  ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT:    movl {{.*}}(%rip), %eax -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0  ; AVX512F-NEXT:    vpcmpgtq %zmm6, %zmm2, %k0  ; AVX512F-NEXT:    vpcmpgtq %zmm7, %zmm3, %k1  ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}  ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0 @@ -6607,14 +6600,13 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {  ; AVX512DQ:       # %bb.0:  ; AVX512DQ-NEXT:    vpcmpgtq %zmm4, %zmm0, %k0  ; AVX512DQ-NEXT:    vpcmpgtq %zmm5, %zmm1, %k1 -; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT:    movl {{.*}}(%rip), %eax -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z} +; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0  ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0  ; AVX512DQ-NEXT:    vpcmpgtq %zmm6, %zmm2, %k0  ; AVX512DQ-NEXT:    vpcmpgtq %zmm7, %zmm3, %k1 -; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k1 -; AVX512DQ-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z} +; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0 +; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1  ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1  ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0  ; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0 diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll index 44fe38fa86b9..4da23e539f6a 100644 --- a/test/CodeGen/X86/vector-half-conversions.ll +++ b/test/CodeGen/X86/vector-half-conversions.ll @@ -2195,8 +2195,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {  ; AVX512VL-NEXT:    shlq $32, %rdx  ; AVX512VL-NEXT:    orq %rcx, %rdx  ; AVX512VL-NEXT:    vmovq %rdx, 
%xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]  ; AVX512VL-NEXT:    retq    %1 = fptrunc <4 x float> %a0 to <4 x half>    %2 = bitcast <4 x half> %1 to <4 x i16> @@ -2205,108 +2204,30 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {  }  define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind { -; AVX1-LABEL: cvt_4f32_to_8i16_zero: -; AVX1:       # %bb.0: -; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %eax -; AVX1-NEXT:    shll $16, %eax -; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %ecx -; AVX1-NEXT:    movzwl %cx, %ecx -; AVX1-NEXT:    orl %eax, %ecx -; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %eax -; AVX1-NEXT:    shll $16, %eax -; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT:    vmovd %xmm0, %edx -; AVX1-NEXT:    movzwl %dx, %edx -; AVX1-NEXT:    orl %eax, %edx -; AVX1-NEXT:    shlq $32, %rdx -; AVX1-NEXT:    orq %rcx, %rdx -; AVX1-NEXT:    vmovq %rdx, %xmm0 -; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT:    retq -; -; AVX2-LABEL: cvt_4f32_to_8i16_zero: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %eax -; AVX2-NEXT:    shll $16, %eax -; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %ecx -; AVX2-NEXT:    movzwl %cx, %ecx -; AVX2-NEXT:    orl %eax, %ecx -; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %eax -; AVX2-NEXT:    shll $16, %eax -; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT:    vmovd %xmm0, %edx -; AVX2-NEXT:    movzwl %dx, %edx -; AVX2-NEXT:    orl %eax, %edx -; AVX2-NEXT:    shlq $32, %rdx -; AVX2-NEXT:    orq %rcx, %rdx -; AVX2-NEXT:    vmovq %rdx, %xmm0 -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT:    retq -; -; AVX512F-LABEL: cvt_4f32_to_8i16_zero: -; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %eax -; AVX512F-NEXT:    shll $16, %eax -; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %ecx -; AVX512F-NEXT:    movzwl %cx, %ecx -; AVX512F-NEXT:    orl %eax, %ecx -; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %eax -; AVX512F-NEXT:    shll $16, %eax -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT:    vmovd %xmm0, %edx -; AVX512F-NEXT:    movzwl %dx, %edx -; AVX512F-NEXT:    orl %eax, %edx -; AVX512F-NEXT:    shlq $32, %rdx -; AVX512F-NEXT:    orq %rcx, %rdx -; AVX512F-NEXT:    vmovq %rdx, %xmm0 -; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT:    retq -; -; AVX512VL-LABEL: cvt_4f32_to_8i16_zero: -; AVX512VL:       # %bb.0: -; 
AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %eax -; AVX512VL-NEXT:    shll $16, %eax -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %ecx -; AVX512VL-NEXT:    movzwl %cx, %ecx -; AVX512VL-NEXT:    orl %eax, %ecx -; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %eax -; AVX512VL-NEXT:    shll $16, %eax -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT:    vmovd %xmm0, %edx -; AVX512VL-NEXT:    movzwl %dx, %edx -; AVX512VL-NEXT:    orl %eax, %edx -; AVX512VL-NEXT:    shlq $32, %rdx -; AVX512VL-NEXT:    orq %rcx, %rdx -; AVX512VL-NEXT:    vmovq %rdx, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT:    retq +; ALL-LABEL: cvt_4f32_to_8i16_zero: +; ALL:       # %bb.0: +; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %eax +; ALL-NEXT:    shll $16, %eax +; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %ecx +; ALL-NEXT:    movzwl %cx, %ecx +; ALL-NEXT:    orl %eax, %ecx +; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %eax +; ALL-NEXT:    shll $16, %eax +; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT:    vmovd %xmm0, %edx +; ALL-NEXT:    movzwl %dx, %edx +; ALL-NEXT:    orl %eax, %edx +; ALL-NEXT:    shlq $32, %rdx +; ALL-NEXT:    orq %rcx, %rdx +; ALL-NEXT:    vmovq %rdx, %xmm0 +; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; ALL-NEXT:    retq    %1 = fptrunc <4 x float> %a0 to <4 x half>    %2 = bitcast <4 x half> %1 to <4 x i16>    %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -2715,8 +2636,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw  ; AVX512VL-NEXT:    shlq $32, %rdx  ; AVX512VL-NEXT:    orq %rcx, %rdx  ; AVX512VL-NEXT:    vmovq %rdx, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]  ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)  ; AVX512VL-NEXT:    retq    %1 = fptrunc <4 x float> %a0 to <4 x half> @@ -2727,112 +2647,31 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw  }  define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwind { -; AVX1-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX1:       # %bb.0: -; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %eax -; AVX1-NEXT:    shll $16, %eax -; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %ecx -; AVX1-NEXT:    movzwl %cx, %ecx -; AVX1-NEXT:    orl %eax, %ecx -; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX1-NEXT:    vmovd %xmm1, %eax -; AVX1-NEXT:    shll 
$16, %eax -; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX1-NEXT:    vmovd %xmm0, %edx -; AVX1-NEXT:    movzwl %dx, %edx -; AVX1-NEXT:    orl %eax, %edx -; AVX1-NEXT:    shlq $32, %rdx -; AVX1-NEXT:    orq %rcx, %rdx -; AVX1-NEXT:    vmovq %rdx, %xmm0 -; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT:    vmovdqa %xmm0, (%rdi) -; AVX1-NEXT:    retq -; -; AVX2-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %eax -; AVX2-NEXT:    shll $16, %eax -; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %ecx -; AVX2-NEXT:    movzwl %cx, %ecx -; AVX2-NEXT:    orl %eax, %ecx -; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX2-NEXT:    vmovd %xmm1, %eax -; AVX2-NEXT:    shll $16, %eax -; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX2-NEXT:    vmovd %xmm0, %edx -; AVX2-NEXT:    movzwl %dx, %edx -; AVX2-NEXT:    orl %eax, %edx -; AVX2-NEXT:    shlq $32, %rdx -; AVX2-NEXT:    orq %rcx, %rdx -; AVX2-NEXT:    vmovq %rdx, %xmm0 -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT:    vmovdqa %xmm0, (%rdi) -; AVX2-NEXT:    retq -; -; AVX512F-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX512F:       # %bb.0: -; AVX512F-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %eax -; AVX512F-NEXT:    shll $16, %eax -; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %ecx -; AVX512F-NEXT:    movzwl %cx, %ecx -; AVX512F-NEXT:    orl %eax, %ecx -; AVX512F-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512F-NEXT:    vmovd %xmm1, %eax -; AVX512F-NEXT:    shll $16, %eax -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX512F-NEXT:    vmovd %xmm0, %edx -; AVX512F-NEXT:    movzwl %dx, %edx -; AVX512F-NEXT:    orl %eax, %edx -; AVX512F-NEXT:    shlq $32, %rdx -; AVX512F-NEXT:    orq %rcx, %rdx -; AVX512F-NEXT:    vmovq %rdx, %xmm0 -; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi) -; AVX512F-NEXT:    retq -; -; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %eax -; AVX512VL-NEXT:    shll $16, %eax -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %ecx -; AVX512VL-NEXT:    movzwl %cx, %ecx -; AVX512VL-NEXT:    orl %eax, %ecx -; AVX512VL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 -; AVX512VL-NEXT:    vmovd %xmm1, %eax -; AVX512VL-NEXT:    shll $16, %eax -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 -; AVX512VL-NEXT:    vmovd %xmm0, %edx -; AVX512VL-NEXT:    movzwl %dx, %edx -; AVX512VL-NEXT:    orl %eax, %edx -; AVX512VL-NEXT:    shlq $32, %rdx -; AVX512VL-NEXT:    orq %rcx, %rdx -; AVX512VL-NEXT:    vmovq %rdx, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = 
xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi) -; AVX512VL-NEXT:    retq +; ALL-LABEL: store_cvt_4f32_to_8i16_zero: +; ALL:       # %bb.0: +; ALL-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %eax +; ALL-NEXT:    shll $16, %eax +; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %ecx +; ALL-NEXT:    movzwl %cx, %ecx +; ALL-NEXT:    orl %eax, %ecx +; ALL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; ALL-NEXT:    vcvtps2ph $4, %xmm1, %xmm1 +; ALL-NEXT:    vmovd %xmm1, %eax +; ALL-NEXT:    shll $16, %eax +; ALL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; ALL-NEXT:    vcvtps2ph $4, %xmm0, %xmm0 +; ALL-NEXT:    vmovd %xmm0, %edx +; ALL-NEXT:    movzwl %dx, %edx +; ALL-NEXT:    orl %eax, %edx +; ALL-NEXT:    shlq $32, %rdx +; ALL-NEXT:    orq %rcx, %rdx +; ALL-NEXT:    vmovq %rdx, %xmm0 +; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; ALL-NEXT:    vmovdqa %xmm0, (%rdi) +; ALL-NEXT:    retq    %1 = fptrunc <4 x float> %a0 to <4 x half>    %2 = bitcast <4 x half> %1 to <4 x i16>    %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -3389,8 +3228,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {  ; AVX512VL-NEXT:    shlq $32, %rax  ; AVX512VL-NEXT:    orq %r14, %rax  ; AVX512VL-NEXT:    vmovq %rax, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]  ; AVX512VL-NEXT:    addq $40, %rsp  ; AVX512VL-NEXT:    popq %rbx  ; AVX512VL-NEXT:    popq %r14 @@ -3478,84 +3316,43 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {  ; AVX2-NEXT:    popq %r14  ; AVX2-NEXT:    retq  ; -; AVX512F-LABEL: cvt_4f64_to_8i16_zero: -; AVX512F:       # %bb.0: -; AVX512F-NEXT:    pushq %r14 -; AVX512F-NEXT:    pushq %rbx -; AVX512F-NEXT:    subq $40, %rsp -; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movl %eax, %ebx -; AVX512F-NEXT:    shll $16, %ebx -; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movzwl %ax, %r14d -; AVX512F-NEXT:    orl %ebx, %r14d -; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movl %eax, %ebx -; AVX512F-NEXT:    shll $16, %ebx -; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movzwl %ax, %eax -; AVX512F-NEXT:    orl %ebx, %eax -; AVX512F-NEXT:    shlq $32, %rax -; AVX512F-NEXT:    orq %r14, %rax -; AVX512F-NEXT:    vmovq %rax, %xmm0 -; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT:    addq $40, %rsp -; AVX512F-NEXT:    popq %rbx -; AVX512F-NEXT:    popq %r14 -; AVX512F-NEXT:    retq -; -; AVX512VL-LABEL: cvt_4f64_to_8i16_zero: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    pushq %r14 -; AVX512VL-NEXT:    pushq %rbx -; AVX512VL-NEXT:    subq $40, %rsp -; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movl %eax, %ebx -; AVX512VL-NEXT:    shll $16, %ebx -; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movzwl %ax, %r14d -; AVX512VL-NEXT:    orl %ebx, %r14d -; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movl %eax, %ebx -; AVX512VL-NEXT:    shll $16, %ebx -; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movzwl %ax, %eax -; AVX512VL-NEXT:    orl %ebx, %eax -; AVX512VL-NEXT:    shlq $32, %rax -; AVX512VL-NEXT:    orq %r14, %rax -; AVX512VL-NEXT:    vmovq %rax, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT:    addq $40, %rsp -; AVX512VL-NEXT:    popq %rbx -; AVX512VL-NEXT:    popq %r14 -; AVX512VL-NEXT:    retq +; AVX512-LABEL: cvt_4f64_to_8i16_zero: +; AVX512:       # %bb.0: +; AVX512-NEXT:    pushq %r14 +; AVX512-NEXT:    pushq %rbx +; AVX512-NEXT:    subq $40, %rsp +; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movl %eax, %ebx +; AVX512-NEXT:    shll $16, %ebx +; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movzwl %ax, %r14d +; AVX512-NEXT:    orl %ebx, %r14d +; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movl %eax, %ebx +; AVX512-NEXT:    shll $16, %ebx +; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movzwl %ax, %eax +; AVX512-NEXT:    orl %ebx, %eax +; AVX512-NEXT:    shlq $32, %rax +; AVX512-NEXT:    orq %r14, %rax +; AVX512-NEXT:    vmovq %rax, %xmm0 +; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT:    addq $40, %rsp +; AVX512-NEXT:    popq %rbx +; AVX512-NEXT:    popq %r14 +; AVX512-NEXT:    retq    %1 = fptrunc <4 x double> %a0 to <4 x half>    %2 = bitcast <4 x half> %1 to <4 x i16>    %3 = shufflevector <4 x i16> %2, 
<4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> @@ -4095,8 +3892,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun  ; AVX512VL-NEXT:    shlq $32, %rax  ; AVX512VL-NEXT:    orq %rbx, %rax  ; AVX512VL-NEXT:    vmovq %rax, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u]  ; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)  ; AVX512VL-NEXT:    addq $32, %rsp  ; AVX512VL-NEXT:    popq %rbx @@ -4195,92 +3991,47 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw  ; AVX2-NEXT:    popq %rbp  ; AVX2-NEXT:    retq  ; -; AVX512F-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX512F:       # %bb.0: -; AVX512F-NEXT:    pushq %rbp -; AVX512F-NEXT:    pushq %r14 -; AVX512F-NEXT:    pushq %rbx -; AVX512F-NEXT:    subq $32, %rsp -; AVX512F-NEXT:    movq %rdi, %r14 -; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movl %eax, %ebp -; AVX512F-NEXT:    shll $16, %ebp -; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movzwl %ax, %ebx -; AVX512F-NEXT:    orl %ebp, %ebx -; AVX512F-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0 -; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT:    vzeroupper -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movl %eax, %ebp -; AVX512F-NEXT:    shll $16, %ebp -; AVX512F-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512F-NEXT:    callq __truncdfhf2 -; AVX512F-NEXT:    movzwl %ax, %eax -; AVX512F-NEXT:    orl %ebp, %eax -; AVX512F-NEXT:    shlq $32, %rax -; AVX512F-NEXT:    orq %rbx, %rax -; AVX512F-NEXT:    vmovq %rax, %xmm0 -; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT:    vmovdqa %xmm0, (%r14) -; AVX512F-NEXT:    addq $32, %rsp -; AVX512F-NEXT:    popq %rbx -; AVX512F-NEXT:    popq %r14 -; AVX512F-NEXT:    popq %rbp -; AVX512F-NEXT:    retq -; -; AVX512VL-LABEL: store_cvt_4f64_to_8i16_zero: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    pushq %rbp -; AVX512VL-NEXT:    pushq %r14 -; AVX512VL-NEXT:    pushq %rbx -; AVX512VL-NEXT:    subq $32, %rsp -; AVX512VL-NEXT:    movq %rdi, %r14 -; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movl %eax, %ebp -; AVX512VL-NEXT:    shll $16, %ebp -; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movzwl %ax, %ebx -; AVX512VL-NEXT:    orl %ebp, %ebx -; AVX512VL-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload -; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill -; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    callq 
__truncdfhf2 -; AVX512VL-NEXT:    movl %eax, %ebp -; AVX512VL-NEXT:    shll $16, %ebp -; AVX512VL-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX512VL-NEXT:    callq __truncdfhf2 -; AVX512VL-NEXT:    movzwl %ax, %eax -; AVX512VL-NEXT:    orl %ebp, %eax -; AVX512VL-NEXT:    shlq $32, %rax -; AVX512VL-NEXT:    orq %rbx, %rax -; AVX512VL-NEXT:    vmovq %rax, %xmm0 -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14) -; AVX512VL-NEXT:    addq $32, %rsp -; AVX512VL-NEXT:    popq %rbx -; AVX512VL-NEXT:    popq %r14 -; AVX512VL-NEXT:    popq %rbp -; AVX512VL-NEXT:    retq +; AVX512-LABEL: store_cvt_4f64_to_8i16_zero: +; AVX512:       # %bb.0: +; AVX512-NEXT:    pushq %rbp +; AVX512-NEXT:    pushq %r14 +; AVX512-NEXT:    pushq %rbx +; AVX512-NEXT:    subq $32, %rsp +; AVX512-NEXT:    movq %rdi, %r14 +; AVX512-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movl %eax, %ebp +; AVX512-NEXT:    shll $16, %ebp +; AVX512-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movzwl %ax, %ebx +; AVX512-NEXT:    orl %ebp, %ebx +; AVX512-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload +; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0 +; AVX512-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT:    vzeroupper +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movl %eax, %ebp +; AVX512-NEXT:    shll $16, %ebp +; AVX512-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT:    callq __truncdfhf2 +; AVX512-NEXT:    movzwl %ax, %eax +; AVX512-NEXT:    orl %ebp, %eax +; AVX512-NEXT:    shlq $32, %rax +; AVX512-NEXT:    orq %rbx, %rax +; AVX512-NEXT:    vmovq %rax, %xmm0 +; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT:    vmovdqa %xmm0, (%r14) +; AVX512-NEXT:    addq $32, %rsp +; AVX512-NEXT:    popq %rbx +; AVX512-NEXT:    popq %r14 +; AVX512-NEXT:    popq %rbp +; AVX512-NEXT:    retq    %1 = fptrunc <4 x double> %a0 to <4 x half>    %2 = bitcast <4 x half> %1 to <4 x i16>    %3 = shufflevector <4 x i16> %2, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll index b40c9eddd46b..8af96c168be6 100644 --- a/test/CodeGen/X86/vector-rotate-128.ll +++ b/test/CodeGen/X86/vector-rotate-128.ll @@ -699,20 +699,35 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {  ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ; -; AVX512-LABEL: var_rotate_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512-NEXT:    vpsubb %xmm1, %xmm2, %xmm2 -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT:    vpsllvd %zmm1, %zmm0, %zmm1 -; AVX512-NEXT:    vpmovdb %zmm1, %xmm1 -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero -; AVX512-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512BW-LABEL: var_rotate_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512VL-LABEL: var_rotate_v16i8: +; AVX512VL:       # %bb.0: +; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT:    vpsubb %xmm1, %xmm2, %xmm2 +; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT:    vpmovwb %ymm1, %xmm1 
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512VL-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512VL-NEXT:    vpor %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT:    vzeroupper +; AVX512VL-NEXT:    retq  ;  ; XOP-LABEL: var_rotate_v16i8:  ; XOP:       # %bb.0: @@ -1249,16 +1264,29 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {  ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0  ; AVX-NEXT:    retq  ; -; AVX512-LABEL: constant_rotate_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm1 -; AVX512-NEXT:    vpmovdb %zmm1, %xmm1 -; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512BW-LABEL: constant_rotate_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1 +; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] +; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512VL-LABEL: constant_rotate_v16i8: +; AVX512VL:       # %bb.0: +; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512VL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm1 +; AVX512VL-NEXT:    vpmovwb %ymm1, %xmm1 +; AVX512VL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512VL-NEXT:    vpor %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT:    vzeroupper +; AVX512VL-NEXT:    retq  ;  ; XOP-LABEL: constant_rotate_v16i8:  ; XOP:       # %bb.0: diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll index ea33f22cc07a..ca670f40ab3f 100644 --- a/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -531,23 +531,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {  ; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0  ; XOP-NEXT:    retq  ; -; AVX512-LABEL: var_shift_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512DQ-LABEL: var_shift_v16i8: +; AVX512DQ:       # %bb.0: +; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT:    vzeroupper +; AVX512DQ-NEXT:    retq  ; -; AVX512VL-LABEL: var_shift_v16i8: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    retq +; AVX512BW-LABEL: var_shift_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512DQVL-LABEL: var_shift_v16i8: +; AVX512DQVL:       # %bb.0: +; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT:    vzeroupper +; AVX512DQVL-NEXT:    retq +; +; AVX512BWVL-LABEL: var_shift_v16i8: +; AVX512BWVL:       # %bb.0: +; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT:    
vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT:    vzeroupper +; AVX512BWVL-NEXT:    retq  ;  ; X32-SSE-LABEL: var_shift_v16i8:  ; X32-SSE:       # %bb.0: @@ -948,25 +967,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {  ; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0  ; XOPAVX2-NEXT:    retq  ; -; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512DQ-LABEL: splatvar_shift_v16i8: +; AVX512DQ:       # %bb.0: +; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1 +; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT:    vzeroupper +; AVX512DQ-NEXT:    retq  ; -; AVX512VL-LABEL: splatvar_shift_v16i8: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    retq +; AVX512BW-LABEL: splatvar_shift_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1 +; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512DQVL-LABEL: splatvar_shift_v16i8: +; AVX512DQVL:       # %bb.0: +; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1 +; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT:    vzeroupper +; AVX512DQVL-NEXT:    retq +; +; AVX512BWVL-LABEL: splatvar_shift_v16i8: +; AVX512BWVL:       # %bb.0: +; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1 +; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT:    vzeroupper +; AVX512BWVL-NEXT:    retq  ;  ; X32-SSE-LABEL: splatvar_shift_v16i8:  ; X32-SSE:       # %bb.0: @@ -1441,21 +1481,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {  ; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0  ; XOP-NEXT:    retq  ; -; AVX512-LABEL: constant_shift_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512DQ-LABEL: constant_shift_v16i8: +; AVX512DQ:       # %bb.0: +; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQ-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT:    vzeroupper +; AVX512DQ-NEXT:    retq  ; -; AVX512VL-LABEL: constant_shift_v16i8: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0 -; AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    retq +; AVX512BW-LABEL: constant_shift_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0] +; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0 +; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512DQVL-LABEL: constant_shift_v16i8: +; AVX512DQVL:       # %bb.0: +; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0 +; AVX512DQVL-NEXT:    vpsravd {{.*}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT:    vzeroupper +; AVX512DQVL-NEXT:    retq +; +; AVX512BWVL-LABEL: constant_shift_v16i8: +; AVX512BWVL:       # %bb.0: +; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0 +; AVX512BWVL-NEXT:    vpsravw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT:    vzeroupper +; AVX512BWVL-NEXT:    retq  ;  ; X32-SSE-LABEL: constant_shift_v16i8:  ; X32-SSE:       # %bb.0: diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll index 307cf287219d..890cedf97c9d 100644 --- a/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -451,23 +451,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {  ; XOP-NEXT:    vpshlb %xmm1, 
%xmm0, %xmm0  ; XOP-NEXT:    retq  ; -; AVX512-LABEL: var_shift_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; AVX512DQ-LABEL: var_shift_v16i8: +; AVX512DQ:       # %bb.0: +; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT:    vzeroupper +; AVX512DQ-NEXT:    retq  ; -; AVX512VL-LABEL: var_shift_v16i8: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT:    vzeroupper -; AVX512VL-NEXT:    retq +; AVX512BW-LABEL: var_shift_v16i8: +; AVX512BW:       # %bb.0: +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} 
ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX512BW-NEXT:    vzeroupper +; AVX512BW-NEXT:    retq +; +; AVX512DQVL-LABEL: var_shift_v16i8: +; AVX512DQVL:       # %bb.0: +; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero +; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 +; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT:    vzeroupper +; AVX512DQVL-NEXT:    retq +; +; AVX512BWVL-LABEL: var_shift_v16i8: +; AVX512BWVL:       # %bb.0: +; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero +; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0 +; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT:    vzeroupper +; AVX512BWVL-NEXT:    retq  ;  ; X32-SSE-LABEL: var_shift_v16i8:  ; X32-SSE:       # %bb.0: @@ -753,25 +772,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {  ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0  ; XOPAVX2-NEXT:    retq  ; -; AVX512-LABEL: splatvar_shift_v16i8: -; AVX512:       # %bb.0: -; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1 -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT:    vpmovdb %zmm0, %xmm0 -; AVX512-NEXT:  
  vzeroupper
-; AVX512-NEXT:    retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 ;
 ; X32-SSE-LABEL: splatvar_shift_v16i8:
 ; X32-SSE:       # %bb.0:
@@ -1148,21 +1188,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 ;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 ;
 ; X32-SSE-LABEL: constant_shift_v16i8:
 ; X32-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index b518ad5fcffd..9481e46c0c52 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -401,23 +401,42 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
-; AVX512-LABEL: var_shift_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512DQ-LABEL: var_shift_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 ;
-; AVX512VL-LABEL: var_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX512BW-LABEL: var_shift_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: var_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: var_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 ;
 ; X32-SSE-LABEL: var_shift_v16i8:
 ; X32-SSE:       # %bb.0:
@@ -695,25 +714,46 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    retq
 ;
-; AVX512-LABEL: splatvar_shift_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512DQ-LABEL: splatvar_shift_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 ;
-; AVX512VL-LABEL: splatvar_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX512BW-LABEL: splatvar_shift_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: splatvar_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: splatvar_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BWVL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 ;
 ; X32-SSE-LABEL: splatvar_shift_v16i8:
 ; X32-SSE:       # %bb.0:
@@ -998,21 +1038,39 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm0
 ; XOP-NEXT:    retq
 ;
-; AVX512-LABEL: constant_shift_v16i8:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512DQ-LABEL: constant_shift_v16i8:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQ-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 ;
-; AVX512VL-LABEL: constant_shift_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX512BW-LABEL: constant_shift_v16i8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQVL-LABEL: constant_shift_v16i8:
+; AVX512DQVL:       # %bb.0:
+; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512DQVL-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQVL-NEXT:    vzeroupper
+; AVX512DQVL-NEXT:    retq
+;
+; AVX512BWVL-LABEL: constant_shift_v16i8:
+; AVX512BWVL:       # %bb.0:
+; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512BWVL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
+; AVX512BWVL-NEXT:    retq
 ;
 ; X32-SSE-LABEL: constant_shift_v16i8:
 ; X32-SSE:       # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 2fcbd89b857e..2f5a2b116115 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -3,8 +3,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI
 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -58,17 +60,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01(
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
   ret <16 x i8> %shuffle
 }
@@ -93,17 +88,10 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,1,1,8,8,9,9,8,8,9,9,10,10,11,11]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i8> %shuffle
 }
@@ -115,11 +103,27 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03(
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_00_00_00_00_01_01_01_01_02_02_02_02_03_03_03_03:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
+; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
   ret <16 x i8> %shuffle
 }
@@ -131,11 +135,27 @@ define <16 x i8> @shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07(
 ; SSE-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_04_04_04_04_05_05_05_05_06_06_06_06_07_07_07_07:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
+; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7>
   ret <16 x i8> %shuffle
 }
@@ -1203,12 +1223,25 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00(
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX1OR2:       # %bb.0: # %entry
+; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX1OR2-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VLBW-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLBW:       # %bb.0: # %entry
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[2],zero,zero,zero
+; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,10,2,7],zero,xmm0[14,7,2],zero,xmm0[3,1,14],zero,xmm0[9,11,0]
+; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX512VLBW-NEXT:    retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00:
+; AVX512VLVBMI:       # %bb.0: # %entry
+; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0>
+; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
+; AVX512VLVBMI-NEXT:    retq
 entry:
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 undef, i32 10, i32 2, i32 7, i32 22, i32 14, i32 7, i32 2, i32 18, i32 3, i32 1, i32 14, i32 18, i32 9, i32 11, i32 0>
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 60bc36948d23..072d71fae570 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3,8 +3,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
 define <8 x i16> @shuffle_v8i16_01012323(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: shuffle_v8i16_01012323:
@@ -85,11 +87,33 @@ define <8 x i16> @shuffle_v8i16_00004444(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_00004444:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_00004444:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_00004444:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i16> %shuffle
 }
@@ -126,11 +150,33 @@ define <8 x i16> @shuffle_v8i16_31206745(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_31206745:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_31206745:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_31206745:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i16> %shuffle
 }
@@ -179,11 +225,33 @@ define <8 x i16> @shuffle_v8i16_23026745(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_23026745:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_23026745:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23026745:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 2, i32 6, i32 7, i32 4, i32 5>
   ret <8 x i16> %shuffle
 }
@@ -194,11 +262,33 @@ define <8 x i16> @shuffle_v8i16_23016747(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_23016747:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_23016747:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_23016747:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 7>
   ret <8 x i16> %shuffle
 }
@@ -597,11 +687,33 @@ define <8 x i16> @shuffle_v8i16_04404567(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_04404567:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_04404567:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_04404567:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 4, i32 4, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %shuffle
 }
@@ -700,17 +812,10 @@ define <8 x i16> @shuffle_v8i16_0127XXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_0127XXXX:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_0127XXXX:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_0127XXXX:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -733,17 +838,10 @@ define <8 x i16> @shuffle_v8i16_XXXX4563(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_XXXX4563:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_XXXX4563:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_XXXX4563:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,4,5,6,7,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
@@ -766,17 +864,10 @@ define <8 x i16> @shuffle_v8i16_4563XXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_4563XXXX:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_4563XXXX:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_4563XXXX:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -799,17 +890,10 @@ define <8 x i16> @shuffle_v8i16_01274563(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_01274563:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_01274563:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,2]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_01274563:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,14,15,8,9,10,11,12,13,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6, i32 3>
   ret <8 x i16> %shuffle
 }
@@ -832,17 +916,10 @@ define <8 x i16> @shuffle_v8i16_45630127(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_45630127:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_45630127:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,0,3,1]
-; AVX512VL-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_45630127:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,6,7,0,1,2,3,4,5,14,15]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 3, i32 0, i32 1, i32 2, i32 7>
   ret <8 x i16> %shuffle
 }
@@ -980,12 +1057,38 @@ define <8 x i16> @shuffle_v8i16_109832ba(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_109832ba:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
-; AVX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_109832ba:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,3,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,7,5]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_109832ba:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,6,7,2,3,12,13,8,9,14,15,10,11]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 0, i32 9, i32 8, i32 3, i32 2, i32 11, i32 10>
   ret <8 x i16> %shuffle
 }
@@ -1028,13 +1131,43 @@ define <8 x i16> @shuffle_v8i16_0213cedf(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_0213cedf:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_0213cedf:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,3,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_0213cedf:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11,14,15,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; AVX512VL-FAST-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 1, i32 3, i32 12, i32 14, i32 13, i32 15>
   ret <8 x i16> %shuffle
 }
@@ -1064,12 +1197,38 @@ define <8 x i16> @shuffle_v8i16_443aXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; SSE41-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuffle_v8i16_443aXXXX:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuffle_v8i16_443aXXXX:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_443aXXXX:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 4, i32 3, i32 10, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -1336,13 +1495,35 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v8i16_XXX1X579:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
-; AVX2OR512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
+; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i16_XXX1X579:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpbroadcastd %xmm1, %xmm1
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
+; AVX512VL-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 5, i32 7, i32 9>
   ret <8 x i16> %shuffle
 }
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 11f25a2d687d..cbd1b83a4eb2 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1,7 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST
 define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
@@ -156,15 +158,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpbroadcastw %xmm0, %xmm1
+; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
 ; AVX512VL:       # %bb.0:
@@ -214,12 +225,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL:       # %bb.0:
@@ -241,12 +259,19 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
-; AVX2-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3]
+; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL:       # %bb.0:
@@ -373,11 +398,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
 }
@@ -393,11 +434,27 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i16> %shuffle
 }
@@ -413,11 +470,27 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
-; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
+; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
+; AVX512VL-SLOW-NEXT:    retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12:
+; AVX512VL-FAST:       # %bb.0:
+; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
+; AVX512VL-FAST-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
 }
@@ -433,11 +506,27 @@ define <16 x i16> @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2OR512VL-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
-; AVX2OR512VL:       # %bb.0:
-; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
-; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2OR512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15:
+; AVX512VL-SLOW:       # %bb.0:
+; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] +; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31] +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>    ret <16 x i16> %shuffle  } @@ -453,11 +542,27 @@ define <16 x i16> @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] -; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,2,4,5,6,7,8,8,10,10,12,13,14,15] +; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,6,8,9,10,11,12,12,14,14] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29] +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>    ret <16 x i16> %shuffle  } @@ -473,11 +578,27 @@ define <16 x i16> @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] -; AVX2OR512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] +; AVX2-SLOW-NEXT:    retq +; +; 
AVX2-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15] +; AVX512VL-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,7,7,8,9,10,11,13,13,15,15] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31] +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>    ret <16 x i16> %shuffle  } @@ -789,13 +910,20 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24:  ; AVX512VL:       # %bb.0: @@ -850,12 +978,18 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] 
-; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,4,5,2,3,0,1,14,15,12,13,10,11,8,9,22,23,20,21,18,19,16,17,30,31,28,29,26,27,24,25] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:  ; AVX512VL:       # %bb.0: @@ -880,13 +1014,20 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_0  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,4,8,9,10,11,15,14,13,12] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,6,7,4,5,2,3,0,1,16,17,18,19,20,21,22,23,22,23,20,21,18,19,16,17] +; AVX2-FAST-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[3,2,1,0,4,5,6,7,11,10,9,8,12,13,14,15] +; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08:  ; AVX512VL:       # %bb.0: @@ -1349,12 +1490,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: 
shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_16_16_16_16_20_20_20_20:  ; AVX512VL:       # %bb.0: @@ -1376,12 +1523,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_16_16_16_16_20_20_20_20:  ; AVX512VL:       # %bb.0: @@ -1404,12 +1557,18 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_08_08_08_08_12_12_12_12_24_24_24_24_28_28_28_28:  ; AVX512VL:       # %bb.0: @@ -1431,12 +1590,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_04_24_24_24_24_28_28_28_28:  ; AVX512VL:       # %bb.0: @@ -1877,15 +2042,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpbroadcastw %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vpbroadcastw %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_08:  ; AVX512VL:       # %bb.0: @@ -1909,15 +2083,24 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 
-; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpsllq $48, %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vpsllq $48, %xmm1, %xmm1 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12:  ; AVX512VL:       # %bb.0: @@ -1995,14 +2178,22 @@ define <16 x i16> @shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,0,4,5,6,7,11,9,10,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, 
%ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,2,3,4,5,0,1,12,13,14,15,8,9,10,11,22,23,18,19,20,21,16,17,28,29,30,31,24,25,26,27] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13:  ; AVX512VL:       # %bb.0: @@ -2081,14 +2272,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[2,3,0,2,4,5,6,7,10,11,8,10,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = <255,255,u,u,255,255,255,255,255,255,0,0,255,255,255,255,255,255,u,u,255,255,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7,0,1,4,5,12,13,14,15,8,9,10,11,20,21,22,23,16,17,20,21,28,29,30,31,24,25,26,27] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13:  ; AVX512VL:       # %bb.0: @@ -2110,14 +2309,22 @@ define <16 x i16> @shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, 
%ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[4,5,6,7,0,1,2,3,12,13,14,15,8,9,14,15,20,21,22,23,16,17,18,19,28,29,30,31,24,25,30,31] +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_15_10_11_08_09_14_15_12_15:  ; AVX512VL:       # %bb.0: @@ -2396,12 +2603,19 @@ define <16 x i16> @shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,2,3,4,5,6,7] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[12,13,12,13,14,15,10,11,2,3,12,13,8,9,6,7,28,29,28,29,30,31,26,27,18,19,28,29,24,25,22,23] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_06_06_07_05_01_06_04_11_14_14_15_13_09_14_12_11:  ; AVX512VL:       # %bb.0: @@ -2540,14 +2754,22 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,0,4,5,6,7,8,10,10,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:  
  retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,8,9,8,9,0,1,8,9,10,11,12,13,14,15,16,17,24,25,24,25,16,17,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15:  ; AVX512VL:       # %bb.0: @@ -2655,17 +2877,10 @@ define <16 x i16> @shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_u  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] -; AVX512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,7,6,7,8,9,10,11,12,15,14,15] -; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX512VL-NEXT:    retq +; AVX2OR512VL-LABEL: shuffle_v16i16_00_01_02_07_uu_uu_uu_uu_08_09_10_15_uu_uu_uu_uu: +; AVX2OR512VL:       # %bb.0: +; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,14,15,4,5,14,15,12,13,14,15,16,17,18,19,20,21,30,31,20,21,30,31,28,29,30,31] +; AVX2OR512VL-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 9, i32 10, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>    ret <16 x i16> %shuffle  } @@ -2705,17 +2920,10 @@ define <16 x i16> @shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_u  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,1,2,0,7,5,6,4] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,3,4,5,6,7,8,11,10,11,12,13,14,15] -; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] -; AVX512VL-NEXT:    retq +; AVX2OR512VL-LABEL: shuffle_v16i16_04_05_06_03_uu_uu_uu_uu_12_13_14_11_uu_uu_uu_uu: +; AVX2OR512VL:       # %bb.0: +; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,6,7,8,9,10,11,0,1,2,3,24,25,26,27,28,29,22,23,24,25,26,27,16,17,18,19] +; AVX2OR512VL-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 5, i32 6, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>    ret <16 x i16> %shuffle  } @@ -2803,12 +3011,19 @@ define <16 x i16> @shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_1  ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0  ; AVX1-NEXT:    retq  
; -; AVX2-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] -; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,6,3,4,5,6,7] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,7,14,15,2,3,0,1,4,5,14,15,6,7,10,11,22,23,30,31,18,19,16,17,20,21,30,31,22,23,26,27] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_03_07_01_00_02_07_03_13_11_15_09_08_10_15_11_13:  ; AVX512VL:       # %bb.0: @@ -2957,16 +3172,25 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3  ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,2,3,8,9,12,13,12,13,14,15,16,17,16,17,20,21,18,19,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:  ; AVX512VL:       # %bb.0: @@ -2992,16 +3216,26 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> -; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> +; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,5,7,7,8,9,10,11,14,13,15,15] +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u,255,255,255,255,u,u,u,u> +; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[8,9,8,9,4,5,10,11,0,1,0,1,12,13,2,3,24,25,24,25,20,21,26,27,16,17,16,17,28,29,18,19] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,2,3,6,7,12,13,10,11,14,15,14,15,16,17,18,19,18,19,22,23,28,29,26,27,30,31,30,31] +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:  ; AVX512VL:       # %bb.0: @@ -3123,15 +3357,24 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: 
shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,1,3,4,5,6,7,8,10,9,11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,5,7,8,9,10,11,12,14,13,15] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15,16,17,20,21,18,19,22,23,24,25,28,29,26,27,30,31] +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:  ; AVX512VL:       # %bb.0: @@ -3156,12 +3399,18 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,3,2,4,5,6,7,8,8,11,10,12,13,14,15] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7,8,9],ymm1[10],ymm0[11,12,13,14,15] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,8,9,6,7,4,5,8,9,10,11,12,13,14,15,24,25,24,25,22,23,20,21,24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: 
shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu:  ; AVX512VL:       # %bb.0: @@ -3363,14 +3612,22 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1 -; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT:    vpbroadcastd %xmm1, %ymm1 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,7,8,9,10,9,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,7,7,8,9,10,11,12,13,15,15] +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15,16,17,18,19,20,21,18,19,24,25,26,27,30,31,30,31] +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT:    vpbroadcastd %xmm1, %ymm1 +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7],ymm0[8,9,10,11,12,13,14],ymm1[15] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25:  ; AVX512VL:       # %bb.0: @@ -3783,12 +4040,31 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a,  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX512VL-FAST-NEXT:    
vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>    ret <16 x i16> %shuffle  } @@ -3821,11 +4097,33 @@ define <16 x i16> @shuffle_v16i16_4_20_5_21_6_22_7_23_u_u_u_u_u_u_u_u(<16 x i16>  }  define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { -; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: -; ALL:       # %bb.0: -; ALL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; ALL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; ALL-NEXT:    retq +; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX1:       # %bb.0: +; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT:    retq +; +; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>    ret <16 x i16> %shuffle  } @@ -3838,12 +4136,31 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,  ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; AVX2OR512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 
+; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>    ret <16 x i16> %shuffle  } @@ -3906,9 +4223,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2  ;  ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25:  ; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,18,3,19,0,16,1,17,10,26,11,27,8,24,9,25] -; AVX512VL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,18,3,19,10,26,11,27,0,16,1,17,8,24,9,25] +; AVX512VL-NEXT:    vpermt2w %ymm1, %ymm2, %ymm0  ; AVX512VL-NEXT:    retq    %1 = shufflevector <16 x i16> %a0, <16 x i16> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 0, i32 16, i32 1, i32 17, i32 10, i32 26, i32 11, i32 27, i32 8, i32 24, i32 9, i32 25>    %2 = bitcast <16 x i16> %1 to <4 x i64> @@ -4006,21 +4322,36 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) {  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: PR24935: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] -; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> -; AVX2-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] -; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] -; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: PR24935: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> +; AVX2-SLOW-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm2 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,5,6,7,8,9,10,11,13,13,14,15] +; AVX2-SLOW-NEXT:   
 vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-SLOW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: PR24935: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm1[8,9,10,11,4,5,8,9,0,1,14,15,12,13,0,1,24,25,26,27,20,21,24,25,16,17,30,31,28,29,16,17] +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,4,5,0,1,10,11,4,5,10,11,4,5,6,7,22,23,20,21,16,17,26,27,20,21,26,27,20,21,22,23] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,255,255,255,255,0,0,u,u,0,0,u,u,u,u,255,255,0,0,u,u,u,u,u,u,0,0> +; AVX2-FAST-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[0,1,2,3,2,3,6,7,10,11,10,11,12,13,14,15,16,17,18,19,18,19,22,23,26,27,26,27,28,29,30,31] +; AVX2-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,6,7,u,u,18,19,u,u,u,u,u,u,u,u,24,25,16,17,u,u] +; AVX2-FAST-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5,6,7,8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13,14,15] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,0,0,0,0,255,255,255,255,0,0,0,0,0,0,255,255] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: PR24935:  ; AVX512VL:       # %bb.0: diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll index 01c7fc466eb8..51ef3a18438f 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1,7 +1,11 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLBW --check-prefix=AVX512VLBW-FAST +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VLVBMI --check-prefix=AVX512VLVBMI-FAST  define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i8> %a, <32 x i8> %b) {  ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -312,17 +316,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_  ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT:    vpbroadcastb %xmm0, %xmm0 -; AVX512VL-NEXT:    movl $32767, %eax # imm = 0x7FFF -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBW-NEXT:    vpbroadcastb %xmm0, %xmm0 +; AVX512VLBW-NEXT:    movl $32767, %eax # imm = 0x7FFF +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0 +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -346,14 +356,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    movl $1, %eax -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: 
shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT:    movl $1, %eax +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 17, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -377,14 +393,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    movw $1, %ax -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT:    movw $1, %ax +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 18, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -408,14 +430,20 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; 
AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    movw $1, %ax -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT:    movw $1, %ax +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 19, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -431,12 +459,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT:    retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    
retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 20, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -452,12 +507,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT:    retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = 
ymm0[0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 21, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -473,12 +555,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT:    retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = 
[0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 22, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -494,12 +603,39 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX2-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-FAST-NEXT:    retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,6,7,0,1,2,3] +; AVX512VLBW-FAST-NEXT:    vpermd %ymm0, %ymm1, %ymm0 +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 23, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 
0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -522,12 +658,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -550,12 +698,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; 
AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 25, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -578,12 +738,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 26, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -606,12 +778,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb 
{{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 27, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -634,12 +818,24 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; 
AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 28, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -662,12 +858,24 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -690,12 +898,24 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: 
shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 30, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -722,14 +942,29 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_  ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX512VL-NEXT:    movl $15, %eax -; AVX512VL-NEXT:    vmovd %eax, %xmm1 -; AVX512VL-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT:    retq +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512VLBW-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] +; AVX512VLBW-SLOW-NEXT:    movl $15, %eax +; AVX512VLBW-SLOW-NEXT:    vmovd %eax, %xmm1 +; AVX512VLBW-SLOW-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,3,0,1] +; AVX512VLBW-FAST-NEXT:    movl $15, %eax +; AVX512VLBW-FAST-NEXT:    vmovd %eax, %xmm1 +; AVX512VLBW-FAST-NEXT:    vpshufb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-FAST-NEXT:    retq +; +; 
AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    movl $31, %eax +; AVX512VLVBMI-NEXT:    vmovd %eax, %xmm1 +; AVX512VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>    ret <32 x i8> %shuffle  } @@ -1092,25 +1327,49 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vpshufb %ymm2, %ymm1, %ymm0 {%k1} -; AVX512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-SLOW-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-FAST-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLBW-SLOW:       # %bb.0: +; AVX512VLBW-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX512VLBW-SLOW-NEXT:    vpshufd 
{{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VLBW-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-SLOW-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-SLOW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-SLOW-NEXT:    vpshufb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512VLBW-SLOW-NEXT:    retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLBW-FAST:       # %bb.0: +; AVX512VLBW-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX512VLBW-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-FAST-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-FAST-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512VLBW-FAST-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,0,32,0,32,0,32,0,32,0,32,0,32,0,32,16,48,16,48,16,48,16,48,16,48,16,48,16,48,16,48] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 0, i32 32, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48, i32 16, i32 48>    ret <32 x i8> %shuffle  } @@ -1129,12 +1388,26 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2 -; AVX2OR512VL-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT:    vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,40,41,42,43,44,45,46,47,16,16,16,16,16,16,16,16,56,57,58,59,60,61,62,63] +; AVX512VLVBMI-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT:    vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 48, i32 48, 
i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>    ret <32 x i8> %shuffle  } @@ -1155,11 +1428,24 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,47,46,45,44,43,42,41,40,23,22,21,20,19,18,17,16,63,62,61,60,59,58,57,56] +; AVX512VLVBMI-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT:    vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24>    ret <32 x i8> %shuffle  } @@ -1177,12 +1463,26 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-NEXT:    retq +; 
+; AVX512VLBW-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[7,6,5,4,3,2,1,0,u,u,u,u,u,u,u,u,23,22,21,20,19,18,17,16,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_07_06_05_04_03_02_01_00_55_54_53_52_51_50_49_48_23_22_21_20_19_18_17_16: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,39,38,37,36,35,34,33,32,23,22,21,20,19,18,17,16,55,54,53,52,51,50,49,48] +; AVX512VLVBMI-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT:    vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16>    ret <32 x i8> %shuffle  } @@ -1350,13 +1650,19 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_  ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] -; AVX512VL-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u] +; AVX512VLBW-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>    ret <32 x i8> %shuffle  } @@ -1379,13 +1685,19 @@ define <32 x i8> @shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_  ; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: 
shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] -; AVX512VL-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u] +; AVX512VLBW-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47,16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>    ret <32 x i8> %shuffle  } @@ -1634,22 +1946,29 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_  ; AVX2-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] -; AVX512VL-NEXT:    movl $-222248896, %eax # imm = 0xF2C0C040 -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] -; AVX512VL-NEXT:    movl $134948620, %eax # imm = 0x80B270C -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0 -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u] +; 
AVX512VLBW-NEXT:    movl $-222248896, %eax # imm = 0xF2C0C040 +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u] +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7] +; AVX512VLBW-NEXT:    movl $134948620, %eax # imm = 0x80B270C +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512VLBW-NEXT:    vmovdqa %ymm1, %ymm0 +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [10,13,44,45,3,3,28,8,49,54,61,12,1,44,16,19,52,51,20,51,17,22,5,0,16,10,27,39,4,2,4,7] +; AVX512VLVBMI-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2 +; AVX512VLVBMI-NEXT:    vmovdqa %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 42, i32 45, i32 12, i32 13, i32 35, i32 35, i32 60, i32 40, i32 17, i32 22, i32 29, i32 44, i32 33, i32 12, i32 48, i32 51, i32 20, i32 19, i32 52, i32 19, i32 49, i32 54, i32 37, i32 32, i32 48, i32 42, i32 59, i32 7, i32 36, i32 34, i32 36, i32 39>    ret <32 x i8> %shuffle  } @@ -1663,11 +1982,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 32, i32 32, i32 32, i32 32, i32 32, 
i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>    ret <32 x i8> %shuffle  } @@ -1682,11 +2013,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_32_32_32_32_32_32_32_32_40_40_40_40_40_40_40_40: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,32,32,32,32,32,32,32,32,40,40,40,40,40,40,40,40] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40, i32 40>    ret <32 x i8> %shuffle  } @@ -1702,11 +2045,23 @@ define <32 x i8> @shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT:    retq +; 
+; AVX512VLVBMI-LABEL: shuffle_v32i8_16_16_16_16_16_16_16_16_24_24_24_24_24_24_24_24_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>    ret <32 x i8> %shuffle  } @@ -1721,11 +2076,23 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512VLBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,16,16,16,16,16,16,16,16,24,24,24,24,24,24,24,24] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_48_48_48_48_48_48_48_48_56_56_56_56_56_56_56_56: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8,48,48,48,48,48,48,48,48,56,56,56,56,56,56,56,56] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 48, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>    ret <32 x i8> %shuffle  } @@ -1738,12 +2105,25 @@ define <32 x i8> @shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; 
AVX2OR512VL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512VLBW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47>    ret <32 x i8> %shuffle  } @@ -1961,18 +2341,26 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz  ; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT:    movl $286331153, %eax # imm = 0x11111111 -; AVX512VL-NEXT:    kmovd %eax, %k1 -; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z} -; AVX512VL-NEXT:    retq +; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512VLBW-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VLBW-NEXT:    
vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX512VLBW-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT:    movl $286331153, %eax # imm = 0x11111111 +; AVX512VLBW-NEXT:    kmovd %eax, %k1 +; AVX512VLBW-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z} +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [56,1,2,3,57,5,6,7,58,9,10,11,59,13,14,15,60,17,18,19,61,21,22,23,62,25,26,27,63,29,30,31] +; AVX512VLVBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLVBMI-NEXT:    vpermt2b %ymm0, %ymm2, %ymm1 +; AVX512VLVBMI-NEXT:    vmovdqa %ymm1, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31>    ret <32 x i8> %shuffle  } @@ -2159,19 +2547,11 @@ define <32 x i8> @shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_  ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] -; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT:    retq +; AVX2OR512VL-LABEL: shuffle_v32i8_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10_10: +; AVX2OR512VL:       # %bb.0: +; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2OR512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2OR512VL-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>    ret <32 x i8> %shuffle  } @@ -2203,37 +2583,21 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_  ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: 
shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpbroadcastb %xmm1, %xmm1 -; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT:    retq +; AVX2OR512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2OR512VL:       # %bb.0: +; AVX2OR512VL-NEXT:    vpbroadcastb %xmm1, %xmm1 +; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,12,12,13,13,14,14,15,15] +; AVX2OR512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2OR512VL-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>    ret <32 x i8> %shuffle  }  define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu(<32 x i8> %a, <32 x i8> %b) { -; AVX1OR2-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX1OR2:       # %bb.0: -; AVX1OR2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT:    retq +; ALL-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; ALL:       # %bb.0: +; ALL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; ALL-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>    ret <32 x i8> %shuffle  } @@ -2245,19 +2609,11 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_  ; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0 -; 
AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX512VL-NEXT:    retq +; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: +; AVX2OR512VL:       # %bb.0: +; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0 +; AVX2OR512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2OR512VL-NEXT:    retq    %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 22, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>    ret <32 x i8> %shuffle  } @@ -2276,13 +2632,27 @@ define <32 x i8> @shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_2  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0 -; AVX2OR512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1 -; AVX2OR512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0 +; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1 +; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBW-NEXT:    vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBW-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-LABEL: shuffe_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLVBMI:       # %bb.0: +; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] +; AVX512VLVBMI-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT:    retq    %1 = lshr <16 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>    %2 = lshr <16 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>    %3 = bitcast <16 x i16> %1 to <32 x i8> @@ -2310,11 +2680,29 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {  ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: PR28136: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: PR28136: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT:    retq +; +; AVX512VLBW-LABEL: PR28136: +; AVX512VLBW:       # %bb.0: +; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-NEXT:    retq +; +; AVX512VLVBMI-SLOW-LABEL: PR28136: +; AVX512VLVBMI-SLOW:       # %bb.0: +; AVX512VLVBMI-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512VLVBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLVBMI-SLOW-NEXT:    retq +; +; AVX512VLVBMI-FAST-LABEL: PR28136: +; AVX512VLVBMI-FAST:       # %bb.0: +; AVX512VLVBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55] +; AVX512VLVBMI-FAST-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-FAST-NEXT:    retq    %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50,i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>    %2 = bitcast <32 x i8> %1 to <4 x i64>    %3 = shufflevector <4 x i64> %2, <4 x i64> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3> diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll index 41dcb5032ee2..ebef762787d9 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1,7 +1,9 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST  define <4 x double> @shuffle_v4f64_0000(<4 x double> %a, <4 x double> %b) {  ; AVX1-LABEL: shuffle_v4f64_0000: @@ -546,19 +548,29 @@ define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {  ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v4f64_0z3z: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] -; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v4f64_0z3z: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] -; AVX512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v4f64_0z3z: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX2-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v4f64_0z3z: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_0z3z: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2] +; AVX512VL-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_0z3z: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 0, i32 4, i32 3, i32 4>    ret <4 x double> %shuffle  } @@ -574,19 +586,29 @@ define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v4f64_1z2z: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v4f64_1z2z: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v4f64_1z2z: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v4f64_1z2z: +; AVX2-FAST:       # %bb.0: +; 
AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4f64_1z2z: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4f64_1z2z: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT:    retq    %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>    ret <4 x double> %1  } @@ -812,11 +834,17 @@ define <4 x i64> @shuffle_v4i64_0124(<4 x i64> %a, <4 x i64> %b) {  ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v4i64_0124: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vbroadcastsd %xmm1, %ymm1 -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX512VL-NEXT:    retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0124: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vbroadcastsd %xmm1, %ymm1 +; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_0124: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,2,4] +; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4>    ret <4 x i64> %shuffle  } @@ -863,12 +891,19 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) {  ; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v4i64_0412: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpbroadcastq %xmm1, %xmm1 -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-NEXT:    retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0412: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpbroadcastq %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2] +; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_0412: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpbroadcastq %xmm1, %xmm1 +; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2] +; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2>    ret <4 x i64> %shuffle  } @@ -889,11 +924,17 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {  ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v4i64_4012: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512VL-NEXT:  
  retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_4012: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,2] +; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_4012: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,2] +; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2>    ret <4 x i64> %shuffle  } @@ -924,9 +965,8 @@ define <4 x i64> @shuffle_v4i64_0451(<4 x i64> %a, <4 x i64> %b) {  ;  ; AVX512VL-LABEL: shuffle_v4i64_0451:  ; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,5,1] +; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0  ; AVX512VL-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1>    ret <4 x i64> %shuffle @@ -958,9 +998,8 @@ define <4 x i64> @shuffle_v4i64_4015(<4 x i64> %a, <4 x i64> %b) {  ;  ; AVX512VL-LABEL: shuffle_v4i64_4015:  ; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,0,1,5] +; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0  ; AVX512VL-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5>    ret <4 x i64> %shuffle @@ -980,11 +1019,17 @@ define <4 x i64> @shuffle_v4i64_2u35(<4 x i64> %a, <4 x i64> %b) {  ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1]  ; AVX2-NEXT:    retq  ; -; AVX512VL-LABEL: shuffle_v4i64_2u35: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] -; AVX512VL-NEXT:    retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_2u35: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,1] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_2u35: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,5,3,5] +; AVX512VL-FAST-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 undef, i32 3, i32 5>    ret <4 x i64> %shuffle  } @@ -1008,9 +1053,8 @@ define <4 x i64> @shuffle_v4i64_1251(<4 x i64> %a, <4 x i64> %b) {  ;  ; AVX512VL-LABEL: shuffle_v4i64_1251:  ; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,2,1] -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,5,1] +; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0  ; AVX512VL-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 5, i32 1>    ret <4 x i64> %shuffle @@ -1121,9 +1165,8 @@ define <4 x i64> @shuffle_v4i64_0415(<4 x i64> %a, <4 x i64> %b) {  ;  ; AVX512VL-LABEL: 
shuffle_v4i64_0415:  ; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,4,1,5] +; AVX512VL-NEXT:    vpermt2q %ymm1, %ymm2, %ymm0  ; AVX512VL-NEXT:    retq    %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>    ret <4 x i64> %shuffle @@ -1564,19 +1607,29 @@ define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {  ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v4i64_z0z3: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v4i64_z0z3: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] -; AVX512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v4i64_z0z3: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v4i64_z0z3: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31] +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_z0z3: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3] +; AVX512VL-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_z0z3: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25,26,27,28,29,30,31] +; AVX512VL-FAST-NEXT:    retq    %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>    ret <4 x i64> %1  } @@ -1592,19 +1645,29 @@ define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {  ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v4i64_1z2z: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX2-NEXT:    retq -; -; AVX512VL-LABEL: shuffle_v4i64_1z2z: -; AVX512VL:       # %bb.0: -; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] -; AVX512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v4i64_1z2z: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX2-SLOW-NEXT:    retq +; +; 
AVX2-FAST-LABEL: shuffle_v4i64_1z2z: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1z2z: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v4i64_1z2z: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-FAST-NEXT:    retq    %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>    ret <4 x i64> %1  } diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll index 44d0217f5295..1b6df7dd2d25 100644 --- a/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1,7 +1,9 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 --check-prefix=AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=ALL --check-prefix=AVX2OR512VL --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST  define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) {  ; AVX1-LABEL: shuffle_v8f32_00000000: @@ -342,12 +344,26 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) {  ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8f32_09ab1def: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2OR512VL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v8f32_09ab1def: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT:    
retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_09ab1def: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_09ab1def: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7] +; AVX512VL-FAST-NEXT:    vpermi2ps %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>    ret <8 x float> %shuffle  } @@ -651,14 +667,23 @@ define <8 x float> @shuffle_v8f32_c348cda0(<8 x float> %a, <8 x float> %b) {  ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v8f32_c348cda0: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> -; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1 -; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-SLOW-NEXT:    vpermps %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] +; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_c348cda0: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [0,3,4,7,4,7,2,0] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> +; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v8f32_c348cda0:  ; AVX512VL:       # %bb.0: @@ -681,14 +706,23 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) {  ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v8f32_f511235a: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] -; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] +; AVX2-SLOW-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_f511235a: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [7,6,2,3,7,6,3,2] +; AVX2-FAST-NEXT:    vpermps %ymm1, 
%ymm2, %ymm1 +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [5,5,1,1,2,3,5,5] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v8f32_f511235a:  ; AVX512VL:       # %bb.0: @@ -722,11 +756,29 @@ define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {  ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8f32_76547654: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8f32_76547654: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_76547654: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_76547654: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>    ret <8 x float> %shuffle  } @@ -738,11 +790,29 @@ define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {  ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8f32_76543210: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8f32_76543210: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8f32_76543210: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8f32_76543210: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>    ret <8 x float> %shuffle  } @@ -798,11 +868,23 @@ define <8 x float> @PR21138(<8 x float> %truc, <8 x float> %tchose) {  ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: PR21138: 
-; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: PR21138: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: PR21138: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: PR21138: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-NEXT:    vpermt2ps %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x float> %truc, <8 x float> %tchose, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>    ret <8 x float> %shuffle  } @@ -1264,12 +1346,26 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) {  ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8i32_09ab1def: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2OR512VL-NEXT:    retq +; AVX2-LABEL: shuffle_v8i32_09ab1def: +; AVX2:       # %bb.0: +; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_09ab1def: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] +; AVX512VL-SLOW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_09ab1def: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} ymm0 = [8,1,2,3,10,5,6,7] +; AVX512VL-FAST-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>    ret <8 x i32> %shuffle  } @@ -1696,13 +1792,21 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) {  ; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7]  ; AVX1-NEXT:    retq  ; -; AVX2-LABEL: shuffle_v8i32_6caa87e5: -; AVX2:       # %bb.0: -; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] -; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] -; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] -; AVX2-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] +; AVX2-SLOW-NEXT:    
vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_6caa87e5: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm2 = [4,4,2,2,0,0,6,6] +; AVX2-FAST-NEXT:    vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] +; AVX2-FAST-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] +; AVX2-FAST-NEXT:    retq  ;  ; AVX512VL-LABEL: shuffle_v8i32_6caa87e5:  ; AVX512VL:       # %bb.0: @@ -1737,11 +1841,29 @@ define <8 x i32> @shuffle_v8i32_76547654(<8 x i32> %a, <8 x i32> %b) {  ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76547654: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8i32_76547654: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_76547654: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_76547654: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>    ret <8 x i32> %shuffle  } @@ -1753,11 +1875,29 @@ define <8 x i32> @shuffle_v8i32_76543210(<8 x i32> %a, <8 x i32> %b) {  ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]  ; AVX1-NEXT:    retq  ; -; AVX2OR512VL-LABEL: shuffle_v8i32_76543210: -; AVX2OR512VL:       # %bb.0: -; AVX2OR512VL-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2OR512VL-NEXT:    retq +; AVX2-SLOW-LABEL: shuffle_v8i32_76543210: +; AVX2-SLOW:       # %bb.0: +; AVX2-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-SLOW-NEXT:    retq +; +; AVX2-FAST-LABEL: shuffle_v8i32_76543210: +; AVX2-FAST:       # %bb.0: +; AVX2-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX2-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT:    retq +; +; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: +; AVX512VL-SLOW:       # %bb.0: +; AVX512VL-SLOW-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX512VL-SLOW-NEXT:    retq +; +; AVX512VL-FAST-LABEL: shuffle_v8i32_76543210: +; AVX512VL-FAST:       # %bb.0: +; AVX512VL-FAST-NEXT:    vmovaps {{.*#+}} ymm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-FAST-NEXT:    vpermps %ymm0, %ymm1, %ymm0 +; AVX512VL-FAST-NEXT:    retq    %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>    ret <8 x i32> %shuffle  
 }
diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 3e49957bf85e..d4fb0fd52a79 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -186,8 +186,7 @@ define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19
 ;
 ; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[2,3,2,3,0,1,0,1,10,11,10,11,8,9,8,9,18,19,18,19,16,17,16,17,26,27,26,27,24,25,24,25,34,35,34,35,32,33,32,33,42,43,42,43,40,41,40,41,50,51,50,51,48,49,48,49,58,59,58,59,56,57,56,57]
 ; SKX-NEXT:    retq
   %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 5, i32 5, i32 4, i32 4, i32 9, i32 9, i32 8, i32 8, i32 13, i32 13, i32 12, i32 12, i32 17, i32 17, i32 16, i32 16, i32 21, i32 21, i32 20, i32 20, i32 25, i32 25, i32 24, i32 24, i32 29, i32 29, i32 28, i32 28>
   ret <32 x i16> %c
@@ -354,3 +353,18 @@ define <8 x i16> @pr32967(<32 x i16> %v) {
  %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> <i32 1,i32 5,i32 9,i32 13,i32 17,i32 21,i32 25,i32 29>
  ret <8 x i16> %shuffle
 }
+
+define <32 x i16> @shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz(<32 x i16> %a) {
+; KNL-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[14,15],zero,zero,ymm0[10,11],zero,zero,ymm0[6,7],zero,zero,ymm0[2,3],zero,zero,ymm0[30,31],zero,zero,ymm0[26,27],zero,zero,ymm0[22,23],zero,zero,ymm0[18,19],zero,zero
+; KNL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[14,15],zero,zero,ymm1[10,11],zero,zero,ymm1[6,7],zero,zero,ymm1[2,3],zero,zero,ymm1[30,31],zero,zero,ymm1[26,27],zero,zero,ymm1[22,23],zero,zero,ymm1[20,21],zero,zero
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: shuffle_v32i16_07_zz_05_zz_03_zz_01_zz_15_zz_13_zz_11_zz_09_zz_23_zz_21_zz_19_zz_17_zz_31_zz_29_zz_27_zz_25_zz:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[14,15],zero,zero,zmm0[10,11],zero,zero,zmm0[6,7],zero,zero,zmm0[2,3],zero,zero,zmm0[30,31],zero,zero,zmm0[26,27],zero,zero,zmm0[22,23],zero,zero,zmm0[18,19],zero,zero,zmm0[46,47],zero,zero,zmm0[42,43],zero,zero,zmm0[38,39],zero,zero,zmm0[34,35],zero,zero,zmm0[62,63],zero,zero,zmm0[58,59],zero,zero,zmm0[54,55],zero,zero,zmm0[52,53],zero,zero
+; SKX-NEXT:    retq
+  %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> <i32 39, i32 0, i32 37, i32 0, i32 35, i32 0, i32 33, i32 0, i32 47, i32 0, i32 45, i32 0, i32 43, i32 0, i32 41, i32 0, i32 55, i32 0, i32 53, i32 0, i32 51, i32 0, i32 49, i32 0, i32 63, i32 0, i32 61, i32 0, i32 59, i32 0, i32 58, i32 0>
+  ret <32 x i16> %shuffle
+}
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 9c92ca756ebd..f3433ce834cd 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -48,9 +48,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
-; AVX512VL-NEXT:    movb $1, %al
-; AVX512VL-NEXT:    kmovw %eax, %k1
-; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm2 {%k1} {z}
+; AVX512VL-NEXT:    movq $-1, %rax
+; AVX512VL-NEXT:    vmovq %rax, %xmm2
 ; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
 ; AVX512VL-NEXT:    vpsllq $63, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
@@ -61,9 +60,8 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT:    movb $1, %al
-; VL_BW_DQ-NEXT:    kmovd %eax, %k1
-; VL_BW_DQ-NEXT:    vpmovm2q %k1, %xmm0
+; VL_BW_DQ-NEXT:    movq $-1, %rax
+; VL_BW_DQ-NEXT:    vmovq %rax, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
 ; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
@@ -123,12 +121,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,1,0,3,7,7,0]
+; AVX512VL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
@@ -137,10 +135,10 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT:    vpermd %ymm0, %ymm1, %ymm0
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
 ; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
@@ -250,12 +248,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT:    vpbroadcastq %xmm0, %zmm0
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX512VL-NEXT:    vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
 ; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
@@ -264,10 +262,10 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
 ; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
@@ -294,12 +292,14 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; AVX512VL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -308,11 +308,12 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
+; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
-; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
@@ -339,10 +340,11 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -351,9 +353,9 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
@@ -382,12 +384,13 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; AVX512VL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -396,11 +399,11 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
 ; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
-; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
+; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
@@ -429,12 +432,13 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -443,11 +447,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
-; VL_BW_DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
+; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4,5,6,7]
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
@@ -462,12 +466,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
-; AVX512F-NEXT:    movb $51, %al
-; AVX512F-NEXT:    kmovw %eax, %k2
-; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
+; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
@@ -478,14 +480,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    kmovw %edi, %k1
-; AVX512VL-NEXT:    movb $51, %al
-; AVX512VL-NEXT:    kmovw %eax, %k2
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512VL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
-; AVX512VL-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -494,11 +494,10 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    kmovd %edi, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1]
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,18446744073709551615,18446744073709551615,0,0]
-; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
@@ -528,15 +527,15 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ;
 ; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsxwq %xmm0, %zmm0
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT:    vpsllq $63, %zmm2, %zmm0
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
 ; AVX512VL-NEXT:    kmovw %k0, %eax
 ; AVX512VL-NEXT:    # kill: def %al killed %al killed %eax
 ; AVX512VL-NEXT:    vzeroupper
@@ -546,11 +545,11 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ; VL_BW_DQ:       # %bb.0:
 ; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
-; VL_BW_DQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
-; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
-; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
+; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
+; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; VL_BW_DQ-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
 ; VL_BW_DQ-NEXT:    kmovd %k0, %eax
 ; VL_BW_DQ-NEXT:    # kill: def %al killed %al killed %eax
 ; VL_BW_DQ-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 0367737dda60..4de24d5fec4d 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -414,63 +414,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm15
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm9
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm10
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm7
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm11
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %eax
 ; SSE2-NEXT:    movd %eax, %xmm12
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl -24(%rsp,%rdx), %eax
 ; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    andl $15, %r9d
-; SSE2-NEXT:    movzbl (%r9,%r10), %eax
+; SSE2-NEXT:    andl $15, %esi
+; SSE2-NEXT:    movzbl -24(%rsp,%rsi), %eax
 ; SSE2-NEXT:    movd %eax, %xmm13
-; SSE2-NEXT:    andl $15, %r8d
-; SSE2-NEXT:    movzbl (%r8,%r10), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    andl $15, %ecx
-; SSE2-NEXT:    movzbl (%rcx,%r10), %eax
+; SSE2-NEXT:    andl $15, %edi
+; SSE2-NEXT:    movzbl -24(%rsp,%rdi), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    andl $15, %r9d
+; SSE2-NEXT:    movzbl -24(%rsp,%r9), %eax
 ; SSE2-NEXT:    movd %eax, %xmm14
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%r10), %eax
+; SSE2-NEXT:    andl $15, %r8d
+; SSE2-NEXT:    movzbl -24(%rsp,%r8), %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    andl $15, %esi
-; SSE2-NEXT:    movzbl (%rsi,%r10), %eax
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    andl $15, %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    andl $15, %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    andl $15, %edi
-; SSE2-NEXT:    movzbl (%rdi,%r10), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -479,12 +478,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
 ; SSE2-NEXT:    retq
 ;
@@ -499,63 +498,62 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %r10
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm8
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm15
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm9
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm3
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm10
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm7
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm11
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm6
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm12
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rdx), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm5
-; SSSE3-NEXT:    andl $15, %r9d
-; SSSE3-NEXT:    movzbl (%r9,%r10), %eax
+; SSSE3-NEXT:    andl $15, %esi
+; SSSE3-NEXT:    movzbl -24(%rsp,%rsi), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm13
-; SSSE3-NEXT:    andl $15, %r8d
-; SSSE3-NEXT:    movzbl (%r8,%r10), %eax
-; SSSE3-NEXT:    movd %eax, %xmm4
-; SSSE3-NEXT:    andl $15, %ecx
-; SSSE3-NEXT:    movzbl (%rcx,%r10), %eax
+; SSSE3-NEXT:    andl $15, %edi
+; SSSE3-NEXT:    movzbl -24(%rsp,%rdi), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    andl $15, %r9d
+; SSSE3-NEXT:    movzbl -24(%rsp,%r9), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm14
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%r10), %eax
+; SSSE3-NEXT:    andl $15, %r8d
+; SSSE3-NEXT:    movzbl -24(%rsp,%r8), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    andl $15, %esi
-; SSSE3-NEXT:    movzbl (%rsi,%r10), %eax
+; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    andl $15, %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
+; SSSE3-NEXT:    movd %eax, %xmm4
+; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    andl $15, %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    andl $15, %edi
-; SSSE3-NEXT:    movzbl (%rdi,%r10), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
@@ -564,12 +562,12 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
 ; SSSE3-NEXT:    retq
 ;
@@ -583,49 +581,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSE41-NEXT:    # kill: def %edi killed %edi def %rdi
 ; SSE41-NEXT:    andl $15, %edi
 ; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; SSE41-NEXT:    movzbl (%rdi,%rax), %edi
-; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    movzbl -24(%rsp,%rdi), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
 ; SSE41-NEXT:    andl $15, %esi
-; SSE41-NEXT:    pinsrb $1, (%rsi,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rsi), %xmm0
 ; SSE41-NEXT:    andl $15, %edx
-; SSE41-NEXT:    pinsrb $2, (%rdx,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rdx), %xmm0
 ; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $3, -24(%rsp,%rcx), %xmm0
 ; SSE41-NEXT:    andl $15, %r8d
-; SSE41-NEXT:    pinsrb $4, (%r8,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $4, -24(%rsp,%r8), %xmm0
 ; SSE41-NEXT:    andl $15, %r9d
-; SSE41-NEXT:    pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $6, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $7, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $8, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $9, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $10, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $11, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $12, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $13, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $14, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    pinsrb $15, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    pinsrb $5, -24(%rsp,%r9), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $6, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $7, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $8, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $9, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $10, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $11, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $12, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -638,49 +635,48 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; AVX-NEXT:    # kill: def %edi killed %edi def %rdi
 ; AVX-NEXT:    andl $15, %edi
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT:    movzbl (%rdi,%rax), %edi
-; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    movzbl -24(%rsp,%rdi), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    andl $15, %esi
-; AVX-NEXT:    vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %edx
-; AVX-NEXT:    vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %r8d
-; AVX-NEXT:    vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %r9d
-; AVX-NEXT:    vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $6, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $7, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $8, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $9, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $10, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $11, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $12, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $13, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $14, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    vpinsrb $15, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0  = extractelement <16 x i8> %x, i8 %i0
   %x1  = extractelement <16 x i8> %x, i8 %i1
@@ -819,69 +815,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movzbl (%rdi), %eax
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl 15(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm8
-; SSE2-NEXT:    movzbl 14(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm15
-; SSE2-NEXT:    movzbl 13(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm9
-; SSE2-NEXT:    movzbl 12(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm3
-; SSE2-NEXT:    movzbl 11(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm10
-; SSE2-NEXT:    movzbl 10(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm7
-; SSE2-NEXT:    movzbl 9(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm11
-; SSE2-NEXT:    movzbl 8(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm6
-; SSE2-NEXT:    movzbl 7(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm12
-; SSE2-NEXT:    movzbl 6(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm5
-; SSE2-NEXT:    movzbl 5(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm13
-; SSE2-NEXT:    movzbl 4(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm4
-; SSE2-NEXT:    movzbl 3(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm14
-; SSE2-NEXT:    movzbl 2(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    movzbl 1(%rdi), %edx
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    movzbl 15(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm8
+; SSE2-NEXT:    movzbl 14(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm15
+; SSE2-NEXT:    movzbl 13(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm9
+; SSE2-NEXT:    movzbl 12(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    movzbl 11(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm10
+; SSE2-NEXT:    movzbl 10(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm7
+; SSE2-NEXT:    movzbl 9(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm11
+; SSE2-NEXT:    movzbl 8(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm6
+; SSE2-NEXT:    movzbl 7(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm12
+; SSE2-NEXT:    movzbl 6(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm5
+; SSE2-NEXT:    movzbl 5(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm13
+; SSE2-NEXT:    movzbl 4(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    movzbl 3(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm14
+; SSE2-NEXT:    movzbl 2(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm1
+; SSE2-NEXT:    movzbl 1(%rdi), %ecx
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
+; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -904,69 +899,68 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    movzbl (%rdi), %eax
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl 15(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm8
-; SSSE3-NEXT:    movzbl 14(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm15
-; SSSE3-NEXT:    movzbl 13(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm9
-; SSSE3-NEXT:    movzbl 12(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm3
-; SSSE3-NEXT:    movzbl 11(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm10
-; SSSE3-NEXT:    movzbl 10(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm7
-; SSSE3-NEXT:    movzbl 9(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm11
-; SSSE3-NEXT:    movzbl 8(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm6
-; SSSE3-NEXT:    movzbl 7(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm12
-; SSSE3-NEXT:    movzbl 6(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm5
-; SSSE3-NEXT:    movzbl 5(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm13
-; SSSE3-NEXT:    movzbl 4(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm4
-; SSSE3-NEXT:    movzbl 3(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm14
-; SSSE3-NEXT:    movzbl 2(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm1
-; SSSE3-NEXT:    movzbl 1(%rdi), %edx
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
-; SSSE3-NEXT:    movd %edx, %xmm2
+; SSSE3-NEXT:    movzbl 15(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm8
+; SSSE3-NEXT:    movzbl 14(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm15
+; SSSE3-NEXT:    movzbl 13(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm9
+; SSSE3-NEXT:    movzbl 12(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    movzbl 11(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm10
+; SSSE3-NEXT:    movzbl 10(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm7
+; SSSE3-NEXT:    movzbl 9(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm11
+; SSSE3-NEXT:    movzbl 8(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm6
+; SSSE3-NEXT:    movzbl 7(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm12
+; SSSE3-NEXT:    movzbl 6(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm5
+; SSSE3-NEXT:    movzbl 5(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm13
+; SSSE3-NEXT:    movzbl 4(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm4
+; SSSE3-NEXT:    movzbl 3(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm14
+; SSSE3-NEXT:    movzbl 2(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm1
+; SSSE3-NEXT:    movzbl 1(%rdi), %ecx
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
+; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
@@ -990,54 +984,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
 ; SSE41-NEXT:    movzbl (%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
 ; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE41-NEXT:    movzbl (%rax,%rcx), %eax
+; SSE41-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; SSE41-NEXT:    movd %eax, %xmm0
 ; SSE41-NEXT:    movzbl 1(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $1, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 2(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $2, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 3(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $3, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $3, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 4(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $4, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $4, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 5(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $5, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $5, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 6(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $6, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $6, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 7(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $7, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $7, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 8(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $8, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $8, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 9(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $9, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $9, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 10(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $10, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $10, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 11(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $11, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $11, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 12(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $12, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $12, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 13(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $13, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 14(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $14, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    movzbl 15(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    pinsrb $15, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rax), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -1045,54 +1038,53 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
 ; AVX-NEXT:    movzbl (%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT:    movzbl (%rax,%rcx), %eax
+; AVX-NEXT:    movzbl -24(%rsp,%rax), %eax
 ; AVX-NEXT:    vmovd %eax, %xmm0
 ; AVX-NEXT:    movzbl 1(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 2(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 3(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $3, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 4(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $4, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 5(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $5, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 6(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 7(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 8(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 9(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 10(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 11(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 12(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 13(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 14(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 15(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %p0  = getelementptr inbounds i8, i8* %i, i64 0
   %p1  = getelementptr inbounds i8, i8* %i, i64 1
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index d25117ca715c..fd4c30fb327b 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -1504,12 +1504,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512VL:       # %bb.0: # %entry
 ; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
 ; AVX512VL-NEXT:    vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
@@ -1531,12 +1528,9 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512BWVL:       # %bb.0: # %entry
 ; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
@@ -1647,43 +1641,13 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
 ; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX-NEXT:    retq
 ;
-; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL:       # %bb.0: # %entry
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL:       # %bb.0: # %entry
-; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX512BWVL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc2x4i32_8i16:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT:    retq
 entry:
   %0 = trunc <4 x i32> %a to <4 x i16>
   %1 = trunc <4 x i32> %b to <4 x i16>
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index 94eadd8c1aaf..3ea65573fc70 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -3,7 +3,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
@@ -1911,11 +1912,28 @@ define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE41-NEXT:    retq
 ;
-; AVX-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT:    retq
+; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    retq
+;
+; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-SLOW:       # %bb.0: # %entry
+; AVX2-SLOW-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX2-SLOW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT:    retq
+;
+; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX2-FAST:       # %bb.0: # %entry
+; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
+; AVX2-FAST-NEXT:    retq
+;
+; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT:    retq
 entry:
   %B = shufflevector <8 x
i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>    %Z = bitcast <8 x i16> %B to <4 x i32> diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll index 985f6a861b93..3d4355e5f39c 100644 --- a/test/CodeGen/X86/vselect.ll +++ b/test/CodeGen/X86/vselect.ll @@ -182,7 +182,7 @@ define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {  define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {  ; SSE2-LABEL: test11:  ; SSE2:       # %bb.0: -; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT:    movaps {{.*#+}} xmm2 = [0,65535,65535,0,0,65535,65535,0]  ; SSE2-NEXT:    andps %xmm2, %xmm0  ; SSE2-NEXT:    andnps %xmm1, %xmm2  ; SSE2-NEXT:    orps %xmm2, %xmm0 @@ -190,12 +190,12 @@ define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {  ;  ; SSE41-LABEL: test11:  ; SSE41:       # %bb.0: -; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]  ; SSE41-NEXT:    retq  ;  ; AVX-LABEL: test11:  ; AVX:       # %bb.0: -; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4],xmm0[5,6],xmm1[7]  ; AVX-NEXT:    retq    %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b    ret <8 x i16> %1  | 
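
The vselect.ll hunks above deserve a note: lanes 4 and 7 of test11's select condition are undef, so the backend may take those lanes from either operand, and the updated movaps/pblendw masks simply encode a different legal choice for them. A minimal standalone reproducer in the same style as these tests could look like the sketch below; the function name is illustrative and the CHECK lines are deliberately loose, neither being part of this patch.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s

define <8 x i16> @undef_lane_blend(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: undef_lane_blend:
; CHECK: pblendw
; Lanes 4 and 7 of the condition are undef, so codegen may blend them from
; either %a or %b; we match the blend instruction but not its immediate.
  %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
  ret <8 x i16> %1
}

Leaving the immediate unchecked keeps such a test stable across changes like this one, at the cost of not pinning down which operand the undef lanes come from; the autogenerated tests in this patch instead pin the full immediate and must be regenerated whenever the preferred encoding changes.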
