From 3edec5c15a78e4abba7eb9102fef3891c84ebdfb Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Sat, 19 Jan 2019 18:44:22 +0000 Subject: Vendor import of llvm release_80 branch r351543: https://llvm.org/svn/llvm-project/llvm/branches/release_80@351543 --- CMakeLists.txt | 2 +- include/llvm/CodeGen/MachineFunction.h | 4 + include/llvm/IR/IntrinsicsAMDGPU.td | 18 ++ lib/CodeGen/AsmPrinter/WinException.cpp | 23 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 + lib/MC/MCWin64EH.cpp | 52 +++- lib/Target/AArch64/AArch64AsmPrinter.cpp | 28 +++ lib/Target/AArch64/AArch64FrameLowering.cpp | 4 + lib/Target/AArch64/AArch64ISelLowering.cpp | 28 +++ lib/Target/AArch64/AArch64InstrInfo.td | 7 + lib/Target/AArch64/AArch64RegisterInfo.cpp | 7 + lib/Target/AMDGPU/AMDGPU.h | 2 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 1 + lib/Target/AMDGPU/AMDGPUISelLowering.h | 1 + lib/Target/AMDGPU/AMDGPUSearchableTables.td | 2 + lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 + lib/Target/AMDGPU/DSInstructions.td | 5 + lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 22 +- lib/Target/AMDGPU/SIISelLowering.cpp | 61 +++++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 14 +- lib/Target/AMDGPU/SIInstrInfo.cpp | 13 +- lib/Target/AMDGPU/SIInstrInfo.h | 2 + lib/Target/AMDGPU/SIInstrInfo.td | 5 + lib/Target/MSP430/MSP430AsmPrinter.cpp | 32 +++ lib/Target/X86/X86ISelLowering.cpp | 2 + lib/Target/X86/X86ISelLowering.h | 6 +- lib/Target/X86/X86InstrAVX512.td | 79 ++++-- lib/Target/X86/X86InstrFragmentsSIMD.td | 2 + lib/Target/X86/X86InstrSSE.td | 30 +-- lib/Target/X86/X86IntrinsicsInfo.h | 36 +-- .../InstCombine/InstructionCombining.cpp | 8 +- lib/Transforms/Scalar/SROA.cpp | 5 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +- test/CodeGen/AArch64/seh-finally.ll | 67 ++++++ test/CodeGen/AArch64/seh-localescape.ll | 30 +++ test/CodeGen/AArch64/wineh4.mir | 6 +- test/CodeGen/AArch64/wineh8.mir | 225 +++++++++++++++++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll | 96 ++++++++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll | 45 ++++ test/CodeGen/MSP430/2009-12-21-FrameAddr.ll | 4 +- test/CodeGen/MSP430/fp.ll | 2 + test/CodeGen/MSP430/interrupt.ll | 4 + test/CodeGen/X86/avx2-intrinsics-x86.ll | 240 ++++++++++++++----- test/CodeGen/X86/avx512-intrinsics.ll | 28 ++- test/CodeGen/X86/avx512bw-intrinsics.ll | 24 +- test/CodeGen/X86/avx512bwvl-intrinsics.ll | 64 +++-- test/Transforms/InstCombine/sink-alloca.ll | 52 ++++ test/Transforms/SLPVectorizer/X86/PR39774.ll | 266 ++++++++++++++------- test/Transforms/SLPVectorizer/X86/PR40310.ll | 18 +- test/Transforms/SROA/basictest.ll | 49 ++++ utils/release/build_llvm_package.bat | 18 +- 51 files changed, 1447 insertions(+), 299 deletions(-) create mode 100644 test/CodeGen/AArch64/seh-finally.ll create mode 100644 test/CodeGen/AArch64/seh-localescape.ll create mode 100644 test/CodeGen/AArch64/wineh8.mir create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll create mode 100644 test/Transforms/InstCombine/sink-alloca.ll diff --git a/CMakeLists.txt b/CMakeLists.txt index b144be50f843..6e5221ebfd33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) endif() if(NOT DEFINED LLVM_VERSION_SUFFIX) - set(LLVM_VERSION_SUFFIX svn) + set(LLVM_VERSION_SUFFIX "") endif() if (NOT PACKAGE_VERSION) diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h index 25edf5bcce51..6dbe1650adab 100644 --- 
a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -329,6 +329,7 @@ class MachineFunction {
   bool CallsUnwindInit = false;
   bool HasEHScopes = false;
   bool HasEHFunclets = false;
+  bool HasLocalEscape = false;

   /// List of C++ TypeInfo used.
   std::vector<const GlobalValue *> TypeInfos;
@@ -811,6 +812,9 @@ public:
   bool hasEHFunclets() const { return HasEHFunclets; }
   void setHasEHFunclets(bool V) { HasEHFunclets = V; }

+  bool hasLocalEscape() const { return HasLocalEscape; }
+  void setHasLocalEscape(bool V) { HasLocalEscape = V; }
+
   /// Find or create an LandingPadInfo for the specified MachineBasicBlock.
   LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);

diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 7913ce828fbc..6585cb71769a 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -392,6 +392,24 @@ class AMDGPULDSF32Intrin<string clang_builtin> :
   [IntrArgMemOnly, NoCapture<0>]
 >;

+class AMDGPUDSOrderedIntrinsic : Intrinsic<
+  [llvm_i32_ty],
+  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
+  // the bit packing can be optimized at the IR level.
+  [LLVMQualPointerType<llvm_i32_ty, 2>, // IntToPtr(M0)
+   llvm_i32_ty, // value to add or swap
+   llvm_i32_ty, // ordering
+   llvm_i32_ty, // scope
+   llvm_i1_ty,  // isVolatile
+   llvm_i32_ty, // ordered count index (OA index), also added to the address
+   llvm_i1_ty,  // wave release, usually set to 1
+   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
+  [NoCapture<0>]
+>;
+
+def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
+def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
+
 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index cf8e8c69bc2a..92df09b7d6a2 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -545,15 +545,17 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
     OS.AddComment(Comment);
   };

-  // Emit a label assignment with the SEH frame offset so we can use it for
-  // llvm.eh.recoverfp.
-  StringRef FLinkageName =
-      GlobalValue::dropLLVMManglingEscape(MF->getFunction().getName());
-  MCSymbol *ParentFrameOffset =
-      Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
-  const MCExpr *MCOffset =
-      MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx);
-  Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset);
+  if (!isAArch64) {
+    // Emit a label assignment with the SEH frame offset so we can use it for
+    // llvm.eh.recoverfp.
+    StringRef FLinkageName =
+        GlobalValue::dropLLVMManglingEscape(MF->getFunction().getName());
+    MCSymbol *ParentFrameOffset =
+        Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
+    const MCExpr *MCOffset =
+        MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx);
+    Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset);
+  }

   // Use the assembler to compute the number of table entries through label
   // difference and division.
@@ -937,6 +939,9 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
   if (FI != INT_MAX) {
     const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
     unsigned UnusedReg;
+    // FIXME: getFrameIndexReference needs to match the behavior of
+    // AArch64RegisterInfo::hasBasePointer in which one of the scenarios where
+    // SP is used is if frame size >= 256.
     Offset = TFI->getFrameIndexReference(*Asm->MF, FI, UnusedReg);
   }

diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 871ab9b29881..bfeb3d1bc2b9 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6182,6 +6182,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
           .addFrameIndex(FI);
     }

+    MF.setHasLocalEscape(true);
+
     return nullptr;
   }

diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index 0724b109e1a1..8bc1f08c8875 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -453,6 +453,38 @@ static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
   }
 }

+// Returns the epilog symbol of an epilog with the exact same unwind code
+// sequence, if it exists. Otherwise, returns nullptr.
+// EpilogInstrs - Unwind codes for the current epilog.
+// Epilogs - Epilogs that potentially match the current epilog.
+static MCSymbol*
+FindMatchingEpilog(const std::vector<WinEH::Instruction>& EpilogInstrs,
+                   const std::vector<MCSymbol *>& Epilogs,
+                   const WinEH::FrameInfo *info) {
+  for (auto *EpilogStart : Epilogs) {
+    auto InstrsIter = info->EpilogMap.find(EpilogStart);
+    assert(InstrsIter != info->EpilogMap.end() &&
+           "Epilog not found in EpilogMap");
+    const auto &Instrs = InstrsIter->second;
+
+    if (Instrs.size() != EpilogInstrs.size())
+      continue;
+
+    bool Match = true;
+    for (unsigned i = 0; i < Instrs.size(); ++i)
+      if (Instrs[i].Operation != EpilogInstrs[i].Operation ||
+          Instrs[i].Offset != EpilogInstrs[i].Offset ||
+          Instrs[i].Register != EpilogInstrs[i].Register) {
+        Match = false;
+        break;
+      }
+
+    if (Match)
+      return EpilogStart;
+  }
+  return nullptr;
+}
+
 // Populate the .xdata section. The format of .xdata on ARM64 is documented at
 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
 static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
@@ -477,12 +509,28 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {

   // Process epilogs.
   MapVector<MCSymbol *, uint32_t> EpilogInfo;
+  // Epilogs processed so far.
+  std::vector<MCSymbol *> AddedEpilogs;
+
   for (auto &I : info->EpilogMap) {
     MCSymbol *EpilogStart = I.first;
     auto &EpilogInstrs = I.second;
     uint32_t CodeBytes = ARM64CountOfUnwindCodes(EpilogInstrs);
-    EpilogInfo[EpilogStart] = TotalCodeBytes;
-    TotalCodeBytes += CodeBytes;
+
+    MCSymbol* MatchingEpilog =
+      FindMatchingEpilog(EpilogInstrs, AddedEpilogs, info);
+    if (MatchingEpilog) {
+      assert(EpilogInfo.find(MatchingEpilog) != EpilogInfo.end() &&
+             "Duplicate epilog not found");
+      EpilogInfo[EpilogStart] = EpilogInfo[MatchingEpilog];
+      // Clear the unwind codes in the EpilogMap, so that they don't get output
+      // in the logic below.
+      EpilogInstrs.clear();
+    } else {
+      EpilogInfo[EpilogStart] = TotalCodeBytes;
+      TotalCodeBytes += CodeBytes;
+      AddedEpilogs.push_back(EpilogStart);
+    }
   }

   // Code Words, Epilog count, E, X, Vers, Function Length
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 0442076992e2..0254a572434f 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -694,6 +694,34 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     break;
+  case AArch64::MOVMCSym: {
+    unsigned DestReg = MI->getOperand(0).getReg();
+    const MachineOperand &MO_Sym = MI->getOperand(1);
+    MachineOperand Hi_MOSym(MO_Sym), Lo_MOSym(MO_Sym);
+    MCOperand Hi_MCSym, Lo_MCSym;
+
+    Hi_MOSym.setTargetFlags(AArch64II::MO_G1 | AArch64II::MO_S);
+    Lo_MOSym.setTargetFlags(AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    MCInstLowering.lowerOperand(Hi_MOSym, Hi_MCSym);
+    MCInstLowering.lowerOperand(Lo_MOSym, Lo_MCSym);
+
+    MCInst MovZ;
+    MovZ.setOpcode(AArch64::MOVZXi);
+    MovZ.addOperand(MCOperand::createReg(DestReg));
+    MovZ.addOperand(Hi_MCSym);
+    MovZ.addOperand(MCOperand::createImm(16));
+    EmitToStreamer(*OutStreamer, MovZ);
+
+    MCInst MovK;
+    MovK.setOpcode(AArch64::MOVKXi);
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(MCOperand::createReg(DestReg));
+    MovK.addOperand(Lo_MCSym);
+    MovK.addOperand(MCOperand::createImm(0));
+    EmitToStreamer(*OutStreamer, MovK);
+    return;
+  }
   case AArch64::MOVIv2d_ns:
     // If the target has , lower this
     // instruction to movi.16b instead.
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 538a8d7e8fbc..621aa8bc783a 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -228,6 +228,10 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
       MFI.getMaxCallFrameSize() > DefaultSafeSPDisplacement)
     return true;

+  // Win64 SEH requires frame pointer if funclets are present.
+  if (MF.hasLocalEscape())
+    return true;
+
   return false;
 }

diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index e01ca14d7f63..762f4413d72b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2743,6 +2743,34 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::aarch64_neon_umin:
     return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::localaddress: {
+    // Returns one of the stack, base, or frame pointer registers, depending on
+    // which is used to reference local variables.
+    MachineFunction &MF = DAG.getMachineFunction();
+    const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
+    unsigned Reg;
+    if (RegInfo->hasBasePointer(MF))
+      Reg = RegInfo->getBaseRegister();
+    else // This function handles the SP or FP case.
+      Reg = RegInfo->getFrameRegister(MF);
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
+                              Op.getSimpleValueType());
+  }
+
+  case Intrinsic::eh_recoverfp: {
+    // FIXME: This needs to be implemented to correctly handle highly aligned
+    // stack objects. For now we simply return the incoming FP. Refer D53541
+    // for more details.
+    SDValue FnOp = Op.getOperand(1);
+    SDValue IncomingFPOp = Op.getOperand(2);
+    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
+    auto *Fn = dyn_cast_or_null<Function>(GSD ?
GSD->getGlobal() : nullptr); + if (!Fn) + report_fatal_error( + "llvm.eh.recoverfp must take a function as the first argument"); + return IncomingFPOp; + } } } diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index c24b8b36441b..86a4119c45f5 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -133,6 +133,10 @@ def UseNegativeImmediates : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates", "NegativeImmediates">; +def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", + SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisInt<1>]>>; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -6801,5 +6805,8 @@ def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; +def MOVMCSym : Pseudo<(outs GPR64:$dst), (ins i64imm:$sym), []>, Sched<[]>; +def : Pat<(i64 (AArch64LocalRecover mcsym:$sym)), (MOVMCSym mcsym:$sym)>; + include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 96ae45ae3d0d..3daac23592de 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -466,6 +466,13 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Modify MI as necessary to handle as much of 'Offset' as possible Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg); + + if (MI.getOpcode() == TargetOpcode::LOCAL_ESCAPE) { + MachineOperand &FI = MI.getOperand(FIOperandNum); + FI.ChangeToImmediate(Offset); + return; + } + if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII)) return; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index bb7801c172f6..55668867cc8e 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -254,7 +254,7 @@ namespace AMDGPUAS { FLAT_ADDRESS = 0, ///< Address space for flat memory. GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). - REGION_ADDRESS = 2, ///< Address space for region memory. + REGION_ADDRESS = 2, ///< Address space for region memory. (GDS) CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. 
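A note on the new AMDGPU intrinsics declared in IntrinsicsAMDGPU.td above: their first operand is a pointer in address space 2 (REGION_ADDRESS, i.e. GDS), and the trailing i32/i1 operands are the ordered-count index and the wave-release/wave-done flags. A minimal IR sketch of a call site, modeled on the tests added later in this patch (the kernel name is illustrative only, and the operand values match the default test case):

declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)*, i32, i32, i32, i1, i32, i1, i1)

define amdgpu_kernel void @ordered_add_example(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
  ; add 31 to the ordered GDS counter: ordering=0, scope=0, non-volatile,
  ; ordered count index 1, wave_release=1, wave_done=1
  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
  store i32 %val, i32 addrspace(1)* %out
  ret void
}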
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 6951c915b177..8d36511a2830 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4192,6 +4192,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
+  NODE_NAME_CASE(DS_ORDERED_COUNT)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 0d22cb2e3e20..d4a751d00a50 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -474,6 +474,7 @@ enum NodeType : unsigned {
   TBUFFER_STORE_FORMAT_D16,
   TBUFFER_LOAD_FORMAT,
   TBUFFER_LOAD_FORMAT_D16,
+  DS_ORDERED_COUNT,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 9dbd7751b4d8..4d0962f65fdc 100644
--- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -72,6 +72,8 @@ def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
+def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 11e4ba4b5010..62e7e44ddb80 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -308,6 +308,8 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   switch (Inst->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index 31d2ebef481d..9c7097e9a520 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -817,6 +817,11 @@ defm : DSAtomicRetPat_mc;

 defm : DSAtomicCmpXChg_mc;

+def : Pat <
+  (SIds_ordered_count i32:$value, i16:$offset),
+  (DS_ORDERED_COUNT $value, (as_i16imm $offset))
+>;
+
 //===----------------------------------------------------------------------===//
 // Real instructions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index c6396de89c4f..69ddbfb53958 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -88,14 +88,28 @@ static bool isSMovRel(unsigned Opcode) {
   }
 }

-static bool isSendMsgTraceDataOrGDS(const MachineInstr &MI) {
+static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
+                                    const MachineInstr &MI) {
+  if (TII.isAlwaysGDS(MI.getOpcode()))
+    return true;
+
   switch (MI.getOpcode()) {
   case AMDGPU::S_SENDMSG:
   case AMDGPU::S_SENDMSGHALT:
   case AMDGPU::S_TTRACEDATA:
     return true;
+  // These DS opcodes don't support GDS.
+  case AMDGPU::DS_NOP:
+  case AMDGPU::DS_PERMUTE_B32:
+  case AMDGPU::DS_BPERMUTE_B32:
+    return false;
   default:
-    // TODO: GDS
+    if (TII.isDS(MI.getOpcode())) {
+      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                           AMDGPU::OpName::gds);
+      if (MI.getOperand(GDS).getImm())
+        return true;
+    }
     return false;
   }
 }
@@ -145,7 +159,7 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;

-  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI) &&
+  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
       checkReadM0Hazards(MI) > 0)
     return NoopHazard;

@@ -199,7 +213,7 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
                             isSMovRel(MI->getOpcode())))
     return std::max(WaitStates, checkReadM0Hazards(MI));

-  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(*MI))
+  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
     return std::max(WaitStates, checkReadM0Hazards(MI));

   return WaitStates;
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..12113fcc1fcb 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -910,6 +910,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -937,6 +939,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -5438,6 +5442,63 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   SDLoc DL(Op);

   switch (IntrID) {
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    SDValue Chain = M->getOperand(0);
+    SDValue M0 = M->getOperand(2);
+    SDValue Value = M->getOperand(3);
+    unsigned OrderedCountIndex = M->getConstantOperandVal(7);
+    unsigned WaveRelease = M->getConstantOperandVal(8);
+    unsigned WaveDone = M->getConstantOperandVal(9);
+    unsigned ShaderType;
+    unsigned Instruction;
+
+    switch (IntrID) {
+    case Intrinsic::amdgcn_ds_ordered_add:
+      Instruction = 0;
+      break;
+    case Intrinsic::amdgcn_ds_ordered_swap:
+      Instruction = 1;
+      break;
+    }
+
+    if (WaveDone && !WaveRelease)
+      report_fatal_error("ds_ordered_count: wave_done requires wave_release");
+
+    switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
+    case CallingConv::AMDGPU_CS:
+    case CallingConv::AMDGPU_KERNEL:
+      ShaderType = 0;
+      break;
+    case CallingConv::AMDGPU_PS:
+      ShaderType = 1;
+      break;
+    case CallingConv::AMDGPU_VS:
+      ShaderType = 2;
+      break;
+    case CallingConv::AMDGPU_GS:
+      ShaderType = 3;
+      break;
+    default:
+      report_fatal_error("ds_ordered_count unsupported for this calling conv");
+    }
+
+    unsigned Offset0 = OrderedCountIndex << 2;
+    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
+                       (Instruction << 4);
+    unsigned Offset = Offset0 | (Offset1 << 8);
+
+    SDValue Ops[] = {
+      Chain,
+      Value,
+      DAG.getTargetConstant(Offset, DL, MVT::i16),
+      copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
+    };
+    return
DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: case Intrinsic::amdgcn_ds_fadd: diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index afc0b4467610..3c13bccd94fa 100644 --- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -536,10 +536,13 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, CurrScore); } if (Inst.mayStore()) { - setExpScore( - &Inst, TII, TRI, MRI, - AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), - CurrScore); + if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), + AMDGPU::OpName::data0) != -1) { + setExpScore( + &Inst, TII, TRI, MRI, + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0), + CurrScore); + } if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data1) != -1) { setExpScore(&Inst, TII, TRI, MRI, @@ -1093,7 +1096,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst, // bracket and the destination operand scores. // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere. if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) { - if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { + if (TII->isAlwaysGDS(Inst.getOpcode()) || + TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) { ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst); ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst); } else { diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 2370d5fa7b27..7f7f1807987a 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2390,6 +2390,16 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, changesVGPRIndexingMode(MI); } +bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { + return Opcode == AMDGPU::DS_ORDERED_COUNT || + Opcode == AMDGPU::DS_GWS_INIT || + Opcode == AMDGPU::DS_GWS_SEMA_V || + Opcode == AMDGPU::DS_GWS_SEMA_BR || + Opcode == AMDGPU::DS_GWS_SEMA_P || + Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || + Opcode == AMDGPU::DS_GWS_BARRIER; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); @@ -2403,7 +2413,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // EXEC = 0, but checking for that case here seems not worth it // given the typical code patterns. 
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT || - Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE) + Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE || + Opcode == AMDGPU::DS_ORDERED_COUNT) return true; if (MI.isInlineAsm()) diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 5b1a05f3785e..8847fd6babb3 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -450,6 +450,8 @@ public: return get(Opcode).TSFlags & SIInstrFlags::DS; } + bool isAlwaysGDS(uint16_t Opcode) const; + static bool isMIMG(const MachineInstr &MI) { return MI.getDesc().TSFlags & SIInstrFlags::MIMG; } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 13afa4d4974b..180a7b0601d7 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -45,6 +45,11 @@ def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", [SDNPMayLoad, SDNPMemOperand] >; +def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>, + [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue] +>; + def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2, [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp index f39c21fc8aa2..5e9b108c2de3 100644 --- a/lib/Target/MSP430/MSP430AsmPrinter.cpp +++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp @@ -17,6 +17,7 @@ #include "MSP430InstrInfo.h" #include "MSP430MCInstLower.h" #include "MSP430TargetMachine.h" +#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -28,6 +29,7 @@ #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/TargetRegistry.h" @@ -44,6 +46,8 @@ namespace { StringRef getPassName() const override { return "MSP430 Assembly Printer"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O, const char* Modifier = nullptr); void printSrcMemOperand(const MachineInstr *MI, int OpNum, @@ -55,6 +59,8 @@ namespace { unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) override; void EmitInstruction(const MachineInstr *MI) override; + + void EmitInterruptVectorSection(MachineFunction &ISR); }; } // end of anonymous namespace @@ -153,6 +159,32 @@ void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); } +void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) { + MCSection *Cur = OutStreamer->getCurrentSectionOnly(); + const auto *F = &ISR.getFunction(); + assert(F->hasFnAttribute("interrupt") && + "Functions with MSP430_INTR CC should have 'interrupt' attribute"); + StringRef IVIdx = F->getFnAttribute("interrupt").getValueAsString(); + MCSection *IV = OutStreamer->getContext().getELFSection( + "__interrupt_vector_" + IVIdx, + ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHF_EXECINSTR); + OutStreamer->SwitchSection(IV); + + const MCSymbol *FunctionSymbol = getSymbol(F); + OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize()); + OutStreamer->SwitchSection(Cur); +} + +bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) 
{
+  // Emit separate section for an interrupt vector if ISR
+  if (MF.getFunction().getCallingConv() == CallingConv::MSP430_INTR)
+    EmitInterruptVectorSection(MF);
+
+  SetupMachineFunction(MF);
+  EmitFunctionBody();
+  return false;
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeMSP430AsmPrinter() {
   RegisterAsmPrinter<MSP430AsmPrinter> X(getTheMSP430Target());
 }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b6a692ee187d..3637562c8ec3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -27202,6 +27202,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
+  case X86ISD::VSHLV:              return "X86ISD::VSHLV";
+  case X86ISD::VSRLV:              return "X86ISD::VSRLV";
   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 910acd80e8b8..66d5d43946a2 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -315,10 +315,8 @@ namespace llvm {
       // Vector shift elements
       VSHL, VSRL, VSRA,

-      // Vector variable shift right arithmetic.
-      // Unlike ISD::SRA, in case shift count greater then element size
-      // use sign bit to fill destination data element.
-      VSRAV,
+      // Vector variable shift
+      VSHLV, VSRLV, VSRAV,

       // Vector shift elements by immediate
       VSHLI, VSRLI, VSRAI,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7423cb85acd2..85676f102be0 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -6445,52 +6445,53 @@ defm : avx512_var_shift_lowering;
 // Special handing for handling VPSRAV intrinsics.
-multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
-                                         list<Predicate> p> {
+multiclass avx512_var_shift_int_lowering<string InstrStr, SDNode OpNode,
+                                         X86VectorVTInfo _,
+                                         list<Predicate> p> {
   let Predicates = p in {
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, _.RC:$src2)),
+    def : Pat<(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rr)
                _.RC:$src1, _.RC:$src2)>;
-    def : Pat<(_.VT (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2))),
+    def : Pat<(_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src2))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
+                     (OpNode _.RC:$src1, _.RC:$src2), _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
+                     (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
+                     (OpNode _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
                _.RC:$src1, _.RC:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1, (_.LdFrag addr:$src2)),
+                     (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
   }
 }

-multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
-                                            list<Predicate> p> :
-           avx512_var_shift_int_lowering<InstrStr, _, p> {
+multiclass avx512_var_shift_int_lowering_mb<string InstrStr, SDNode OpNode,
+                                            X86VectorVTInfo _,
+                                            list<Predicate> p> :
+           avx512_var_shift_int_lowering<InstrStr, OpNode, _, p> {
   let Predicates = p in {
-    def : Pat<(_.VT (X86vsrav _.RC:$src1,
+    def : Pat<(_.VT (OpNode _.RC:$src1,
                      (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
                _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1,
+                     (OpNode _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
-                     (X86vsrav _.RC:$src1,
+                     (OpNode _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmbkz) _.KRC:$mask,
@@ -6498,15 +6499,47 @@ multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v8i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v16i16x_info, [HasVLX, HasBWI]>;
-defm : avx512_var_shift_int_lowering<"VPSRAVW", v32i16_info, [HasBWI]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v4i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v8i32x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVD", v16i32_info, [HasAVX512]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>;
-defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>;
+multiclass avx512_var_shift_int_lowering_vl<string InstrStr, SDNode OpNode,
+                                            AVX512VLVectorVTInfo VTInfo,
+                                            Predicate Pred> {
+  defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info128,
+                                       [HasVLX, Pred]>;
+  defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info256,
+                                       [HasVLX, Pred]>;
+  defm : avx512_var_shift_int_lowering<InstrStr, OpNode, VTInfo.info512,
+                                       [Pred]>;
+}
+
+multiclass avx512_var_shift_int_lowering_mb_vl<string InstrStr, SDNode OpNode,
+                                               AVX512VLVectorVTInfo VTInfo,
+                                               Predicate Pred> {
+  defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info128,
+                                          [HasVLX, Pred]>;
+  defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info256,
+                                          [HasVLX, Pred]>;
+  defm : avx512_var_shift_int_lowering_mb<InstrStr, OpNode, VTInfo.info512,
+                                          [Pred]>;
+}
+
+defm : avx512_var_shift_int_lowering_vl<"VPSRAVW", X86vsrav, avx512vl_i16_info,
+                                        HasBWI>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSRAVD", X86vsrav,
+                                           avx512vl_i32_info, HasAVX512>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSRAVQ", X86vsrav,
+                                           avx512vl_i64_info, HasAVX512>;
+
+defm : avx512_var_shift_int_lowering_vl<"VPSRLVW",
X86vsrlv, avx512vl_i16_info,
+                                        HasBWI>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSRLVD", X86vsrlv,
+                                           avx512vl_i32_info, HasAVX512>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSRLVQ", X86vsrlv,
+                                           avx512vl_i64_info, HasAVX512>;
+
+defm : avx512_var_shift_int_lowering_vl<"VPSLLVW", X86vshlv, avx512vl_i16_info,
+                                        HasBWI>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSLLVD", X86vshlv,
+                                           avx512vl_i32_info, HasAVX512>;
+defm : avx512_var_shift_int_lowering_mb_vl<"VPSLLVQ", X86vshlv,
+                                           avx512vl_i64_info, HasAVX512>;
+
 // Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX.
 let Predicates = [HasAVX512, NoVLX] in {
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11a27ba90586..3d508e2c34f3 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -198,6 +198,8 @@ def X86vsra    : SDNode<"X86ISD::VSRA", X86vshiftuniform>;
 def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                              SDTCisSameAs<0,2>, SDTCisInt<0>]>;

+def X86vshlv : SDNode<"X86ISD::VSHLV", X86vshiftvariable>;
+def X86vsrlv : SDNode<"X86ISD::VSRLV", X86vshiftvariable>;
 def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>;

 def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e2bcd18ce660..ddfc369b1180 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -8318,7 +8318,7 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
 // Variable Bit Shifts
 //
 multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                          ValueType vt128, ValueType vt256> {
+                          SDNode IntrinNode, ValueType vt128, ValueType vt256> {
   def rr : AVX28I opc, string OpcodeStr, SDNode OpNode,
                        (vt256 (load addr:$src2)))))]>,
            VEX_4V, VEX_L, Sched<[SchedWriteVarVecShift.YMM.Folded,
                                  SchedWriteVarVecShift.YMM.ReadAfterFold]>;
+
+  def : Pat<(vt128 (IntrinNode VR128:$src1, VR128:$src2)),
+            (!cast<Instruction>(NAME#"rr") VR128:$src1, VR128:$src2)>;
+  def : Pat<(vt128 (IntrinNode VR128:$src1, (load addr:$src2))),
+            (!cast<Instruction>(NAME#"rm") VR128:$src1, addr:$src2)>;
+  def : Pat<(vt256 (IntrinNode VR256:$src1, VR256:$src2)),
+            (!cast<Instruction>(NAME#"Yrr") VR256:$src1, VR256:$src2)>;
+  def : Pat<(vt256 (IntrinNode VR256:$src1, (load addr:$src2))),
+            (!cast<Instruction>(NAME#"Yrm") VR256:$src1, addr:$src2)>;
 }

 let Predicates = [HasAVX2, NoVLX] in {
-  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
-  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
-  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
-  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
-  defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;
-
-  def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
-            (VPSRAVDrr VR128:$src1, VR128:$src2)>;
-  def : Pat<(v4i32 (X86vsrav VR128:$src1, (load addr:$src2))),
-            (VPSRAVDrm VR128:$src1, addr:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
-            (VPSRAVDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86vsrav VR256:$src1, (load addr:$src2))),
-            (VPSRAVDYrm VR256:$src1, addr:$src2)>;
+  defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, X86vshlv, v4i32, v8i32>;
+  defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, X86vshlv, v2i64, v4i64>, VEX_W;
+  defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, X86vsrlv, v4i32, v8i32>;
+  defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, X86vsrlv, v2i64, v4i64>, VEX_W;
+  defm VPSRAVD :
avx2_var_shift<0x46, "vpsravd", sra, X86vsrav, v4i32, v8i32>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 151e1b9136c4..acb3d48463de 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -389,10 +389,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0), @@ -405,10 +405,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, X86ISD::VSRLV, 0), + X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0), X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND), X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -943,11 +943,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0), X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0), - X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0), - X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0), + X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, X86ISD::VSHLV, 0), + X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, X86ISD::VSHLV, 0), X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0), X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0), 
X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
@@ -971,11 +971,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
-  X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
+  X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, X86ISD::VSRLV, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index be7d43bbcf2c..f530ee1246e8 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -3065,9 +3065,11 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
       I->isTerminator())
     return false;

-  // Do not sink alloca instructions out of the entry block.
-  if (isa<AllocaInst>(I) && I->getParent() ==
-        &DestBlock->getParent()->getEntryBlock())
+  // Do not sink static or dynamic alloca instructions. Static allocas must
+  // remain in the entry block, and dynamic allocas must not be sunk in between
+  // a stacksave / stackrestore pair, which would incorrectly shorten its
+  // lifetime.
+  if (isa<AllocaInst>(I))
     return false;

   // Do not sink into catchswitch blocks.
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index eab77cf4cda9..68ca6c47c8f1 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -3031,7 +3031,10 @@ private:
     ConstantInt *Size =
         ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
                          NewEndOffset - NewBeginOffset);
-    Value *Ptr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+    // Lifetime intrinsics always expect an i8* so directly get such a pointer
+    // for the new alloca slice.
+    Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
+    Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
     Value *New;
     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
       New = IRB.CreateLifetimeStart(Ptr, Size);
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 2e856a7e6802..a07fffe9b98b 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1468,8 +1468,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,

   // If any of the scalars is marked as a value that needs to stay scalar, then
   // we need to gather the scalars.
+  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i])) { + if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, false, UserTreeIdx); return; diff --git a/test/CodeGen/AArch64/seh-finally.ll b/test/CodeGen/AArch64/seh-finally.ll new file mode 100644 index 000000000000..3cbbd03385c5 --- /dev/null +++ b/test/CodeGen/AArch64/seh-finally.ll @@ -0,0 +1,67 @@ +; RUN: llc -mtriple arm64-windows -o - %s | FileCheck %s + +; Function Attrs: noinline optnone uwtable +define dso_local i32 @foo() { +entry: +; CHECK-LABEL: foo +; CHECK: orr w8, wzr, #0x1 +; CHECK: mov w0, wzr +; CHECK: mov x1, x29 +; CHECK: .set .Lfoo$frame_escape_0, -4 +; CHECK: stur w8, [x29, #-4] +; CHECK: bl "?fin$0@0@foo@@" +; CHECK: ldur w0, [x29, #-4] + + %count = alloca i32, align 4 + call void (...) @llvm.localescape(i32* %count) + store i32 0, i32* %count, align 4 + %0 = load i32, i32* %count, align 4 + %add = add nsw i32 %0, 1 + store i32 %add, i32* %count, align 4 + %1 = call i8* @llvm.localaddress() + call void @"?fin$0@0@foo@@"(i8 0, i8* %1) + %2 = load i32, i32* %count, align 4 + ret i32 %2 +} + +define internal void @"?fin$0@0@foo@@"(i8 %abnormal_termination, i8* %frame_pointer) { +entry: +; CHECK-LABEL: @"?fin$0@0@foo@@" +; CHECK: sub sp, sp, #16 +; CHECK: str x1, [sp, #8] +; CHECK: strb w0, [sp, #7] +; CHECK: movz x8, #:abs_g1_s:.Lfoo$frame_escape_0 +; CHECK: movk x8, #:abs_g0_nc:.Lfoo$frame_escape_0 +; CHECK: add x8, x1, x8 +; CHECK: ldr w9, [x8] +; CHECK: add w9, w9, #1 +; CHECK: str w9, [x8] + + %frame_pointer.addr = alloca i8*, align 8 + %abnormal_termination.addr = alloca i8, align 1 + %0 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @foo to i8*), i8* %frame_pointer, i32 0) + %count = bitcast i8* %0 to i32* + store i8* %frame_pointer, i8** %frame_pointer.addr, align 8 + store i8 %abnormal_termination, i8* %abnormal_termination.addr, align 1 + %1 = zext i8 %abnormal_termination to i32 + %cmp = icmp eq i32 %1, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %2 = load i32, i32* %count, align 4 + %add = add nsw i32 %2, 1 + store i32 %add, i32* %count, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i8* @llvm.localrecover(i8*, i8*, i32) + +; Function Attrs: nounwind readnone +declare i8* @llvm.localaddress() + +; Function Attrs: nounwind +declare void @llvm.localescape(...) diff --git a/test/CodeGen/AArch64/seh-localescape.ll b/test/CodeGen/AArch64/seh-localescape.ll new file mode 100644 index 000000000000..0a1675014f62 --- /dev/null +++ b/test/CodeGen/AArch64/seh-localescape.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple arm64-windows %s -o - | FileCheck %s + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @foo() { +entry: +; CHECK-LABEL: foo +; CHECK: .set .Lfoo$frame_escape_0, -4 + + %count = alloca i32, align 4 + call void (...) 
@llvm.localescape(i32* %count)
+  ret i32 0
+}
+
+define internal i32 @"?filt$0@0@foo@@"(i8* %exception_pointers, i8* %frame_pointer) {
+entry:
+; CHECK-LABEL: @"?filt$0@0@foo@@"
+; CHECK: movz x8, #:abs_g1_s:.Lfoo$frame_escape_0
+; CHECK: movk x8, #:abs_g0_nc:.Lfoo$frame_escape_0
+
+  %0 = call i8* @llvm.localrecover(i8* bitcast (i32 ()* @foo to i8*), i8* %frame_pointer, i32 0)
+  %count = bitcast i8* %0 to i32*
+  %1 = load i32, i32* %count, align 4
+  ret i32 %1
+}
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.localrecover(i8*, i8*, i32) #2
+
+; Function Attrs: nounwind
+declare void @llvm.localescape(...) #3
diff --git a/test/CodeGen/AArch64/wineh4.mir b/test/CodeGen/AArch64/wineh4.mir
index 4d4cc892c2e8..3a324324431c 100644
--- a/test/CodeGen/AArch64/wineh4.mir
+++ b/test/CodeGen/AArch64/wineh4.mir
@@ -1,7 +1,7 @@
 # RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
 # RUN:   -disable-branch-fold -filetype=obj \
 # RUN:   | llvm-readobj -unwind | FileCheck %s
-# Check that multiple epilgoues are correctly placed in .xdata.
+# Check that identical epilogues are correctly shared in .xdata.

 # CHECK:      ExceptionData {
 # CHECK-NEXT:   FunctionLength: 164
@@ -9,7 +9,7 @@
 # CHECK-NEXT:   ExceptionData: No
 # CHECK-NEXT:   EpiloguePacked: No
 # CHECK-NEXT:   EpilogueScopes: 2
-# CHECK-NEXT:   ByteCodeLength: 48
+# CHECK-NEXT:   ByteCodeLength: 32
 # CHECK-NEXT:   Prologue [
 # CHECK-NEXT:     0xc80c              ; stp x19, x20, [sp, #96]
 # CHECK-NEXT:     0xc88a              ; stp x21, x22, [sp, #80]
@@ -37,7 +37,7 @@
 # CHECK-NEXT:   }
 # CHECK-NEXT:   EpilogueScope {
 # CHECK-NEXT:     StartOffset: 33
-# CHECK-NEXT:     EpilogueStartIndex: 30
+# CHECK-NEXT:     EpilogueStartIndex: 15
 # CHECK-NEXT:     Opcodes [
 # CHECK-NEXT:       0xc80c            ; ldp x19, x20, [sp, #96]
 # CHECK-NEXT:       0xc88a            ; ldp x21, x22, [sp, #80]
diff --git a/test/CodeGen/AArch64/wineh8.mir b/test/CodeGen/AArch64/wineh8.mir
new file mode 100644
index 000000000000..606bc140b232
--- /dev/null
+++ b/test/CodeGen/AArch64/wineh8.mir
@@ -0,0 +1,225 @@
+# RUN: llc -o - %s -mtriple=aarch64-windows -start-after=prologepilog \
+# RUN:   -disable-branch-fold -filetype=obj \
+# RUN:   | llvm-readobj -unwind | FileCheck %s
+# Check that non-identical epilogues are not shared in .xdata.
+
+# CHECK:      ExceptionData {
+# CHECK-NEXT:   FunctionLength: 160
+# CHECK-NEXT:   Version: 0
+# CHECK-NEXT:   ExceptionData: No
+# CHECK-NEXT:   EpiloguePacked: No
+# CHECK-NEXT:   EpilogueScopes: 2
+# CHECK-NEXT:   ByteCodeLength: 44
+# CHECK-NEXT:   Prologue [
+# CHECK-NEXT:     0xc80c              ; stp x19, x20, [sp, #96]
+# CHECK-NEXT:     0xc88a              ; stp x21, x22, [sp, #80]
+# CHECK-NEXT:     0xc908              ; stp x23, x24, [sp, #64]
+# CHECK-NEXT:     0xc986              ; stp x25, x26, [sp, #48]
+# CHECK-NEXT:     0xca04              ; stp x27, x28, [sp, #32]
+# CHECK-NEXT:     0xd802              ; stp d8, d9, [sp, #16]
+# CHECK-NEXT:     0xda8d              ; stp d10, d11, [sp, #-112]!
+# CHECK-NEXT: 0xe4 ; end
+# CHECK-NEXT: ]
+# CHECK-NEXT: EpilogueScopes [
+# CHECK-NEXT: EpilogueScope {
+# CHECK-NEXT: StartOffset: 16
+# CHECK-NEXT: EpilogueStartIndex: 15
+# CHECK-NEXT: Opcodes [
+# CHECK-NEXT: 0xc80c ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT: 0xc88a ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT: 0xc908 ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT: 0xc986 ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT: 0xd802 ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT: 0xda8d ; ldp d10, d11, [sp], #112
+# CHECK-NEXT: 0xe4 ; end
+# CHECK-NEXT: ]
+# CHECK-NEXT: }
+# CHECK-NEXT: EpilogueScope {
+# CHECK-NEXT: StartOffset: 32
+# CHECK-NEXT: EpilogueStartIndex: 28
+# CHECK-NEXT: Opcodes [
+# CHECK-NEXT: 0xc80c ; ldp x19, x20, [sp, #96]
+# CHECK-NEXT: 0xc88a ; ldp x21, x22, [sp, #80]
+# CHECK-NEXT: 0xc908 ; ldp x23, x24, [sp, #64]
+# CHECK-NEXT: 0xc986 ; ldp x25, x26, [sp, #48]
+# CHECK-NEXT: 0xca04 ; ldp x27, x28, [sp, #32]
+# CHECK-NEXT: 0xd802 ; ldp d8, d9, [sp, #16]
+# CHECK-NEXT: 0xda8d ; ldp d10, d11, [sp], #112
+# CHECK-NEXT: 0xe4 ; end
+# CHECK-NEXT: ]
+# CHECK-NEXT: }
+# CHECK-NEXT: ]
+# CHECK-NEXT: }
+...
+---
+name: test
+alignment: 2
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: true
+registers:
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 112
+  offsetAdjustment: 0
+  maxAlignment: 8
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: true
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 0
+  savePoint: ''
+  restorePoint: ''
+fixedStack:
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x19', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x20', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 2, name: '', type: spill-slot, offset: -24, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x21', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 3, name: '', type: spill-slot, offset: -32, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x22', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 4, name: '', type: spill-slot, offset: -40, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x23', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 5, name: '', type: spill-slot, offset: -48, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x24', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 6, name: '', type: spill-slot, offset: -56, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x25', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 7, name: '', type: spill-slot, offset: -64, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x26', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 8, name: '', type: spill-slot, offset: -72, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x27', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 9, name: '', type: spill-slot, offset: -80, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$x28', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 10, name: '', type: spill-slot, offset: -88, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d8', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 11, name: '', type: spill-slot, offset: -96, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d9', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 12, name: '', type: spill-slot, offset: -104, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d10', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+  - { id: 13, name: '', type: spill-slot, offset: -112, size: 8, alignment: 8,
+      stack-id: 0, callee-saved-register: '$d11', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+constants:
+body: |
+  bb.0.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x0, $x1, $d0, $d1, $d10, $d11, $d8, $d9, $x27, $x28, $x25, $x26, $x23, $x24, $x21, $x22, $x19, $x20
+
+    early-clobber $sp = frame-setup STPDpre killed $d10, killed $d11, $sp, -14 :: (store 8 into %stack.12), (store 8 into %stack.13)
+    frame-setup SEH_SaveFRegP_X 10, 11, -112
+    frame-setup STPDi killed $d8, killed $d9, $sp, 2 :: (store 8 into %stack.10), (store 8 into %stack.11)
+    frame-setup SEH_SaveFRegP 8, 9, 16
+    frame-setup STPXi killed $x27, killed $x28, $sp, 4 :: (store 8 into %stack.8), (store 8 into %stack.9)
+    frame-setup SEH_SaveRegP 27, 28, 32
+    frame-setup STPXi killed $x25, killed $x26, $sp, 6 :: (store 8 into %stack.6), (store 8 into %stack.7)
+    frame-setup SEH_SaveRegP 25, 26, 48
+    frame-setup STPXi killed $x23, killed $x24, $sp, 8 :: (store 8 into %stack.4), (store 8 into %stack.5)
+    frame-setup SEH_SaveRegP 23, 24, 64
+    frame-setup STPXi killed $x21, killed $x22, $sp, 10 :: (store 8 into %stack.2), (store 8 into %stack.3)
+    frame-setup SEH_SaveRegP 21, 22, 80
+    frame-setup STPXi killed $x19, killed $x20, $sp, 12 :: (store 8 into %stack.0), (store 8 into %stack.1)
+    frame-setup SEH_SaveRegP 19, 20, 96
+    frame-setup SEH_PrologEnd
+    frame-setup CFI_INSTRUCTION def_cfa_offset 112
+    frame-setup CFI_INSTRUCTION offset $w19, -8
+    frame-setup CFI_INSTRUCTION offset $w20, -16
+    frame-setup CFI_INSTRUCTION offset $w21, -24
+    frame-setup CFI_INSTRUCTION offset $w22, -32
+    frame-setup CFI_INSTRUCTION offset $w23, -40
+    frame-setup CFI_INSTRUCTION offset $w24, -48
+    frame-setup CFI_INSTRUCTION offset $w25, -56
+    frame-setup CFI_INSTRUCTION offset $w26, -64
+    frame-setup CFI_INSTRUCTION offset $w27, -72
+    frame-setup CFI_INSTRUCTION offset $w28, -80
+    frame-setup CFI_INSTRUCTION offset $b8, -88
+    frame-setup CFI_INSTRUCTION offset $b9, -96
+    frame-setup CFI_INSTRUCTION offset $b10, -104
+    frame-setup CFI_INSTRUCTION offset $b11, -112
+    $x19 = ADDXrr $x0, killed $x1
+    $d8 = FADDDrr killed $d0, $d1
+    $d9 = FADDDrr $d8, $d1
+    $d10 = FADDDrr $d9, $d8
+    $d11 = FADDDrr killed $d9, $d10
+    $x20 = SUBSXrr $x19, killed $x0, implicit-def $nzcv
+    Bcc 1, %bb.2, implicit killed $nzcv
+    B %bb.1
+
+  bb.1:
+    liveins: $x19, $x20
+
+    $x21 = ADDXrr $x20, killed $x19
+    $x22 = ADDXrr $x21, killed $x20
+    $x23 = ADDXrr $x22, killed $x21
+    $x24 = ADDXrr $x23, killed $x22
+    $x25 = ADDXrr $x24, killed $x23
+    $x26 = ADDXrr $x25, killed $x24
+    $x27 = ADDXrr $x26, killed $x25
+    $x28 = ADDXrr $x27, killed $x26
+    $x0 = COPY $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $x27, $x28 = frame-destroy LDPXi $sp, 4 :: (load 8 from %stack.8), (load 8 from %stack.9)
+    frame-destroy SEH_SaveRegP 27, 28, 32
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+  bb.2:
+    liveins: $x28, $d11
+
+    $x0 = COPY $d11
+    $x0 = ADDXrr $x0, killed $x28
+    frame-destroy SEH_EpilogStart
+    $x19, $x20 = frame-destroy LDPXi $sp, 12 :: (load 8 from %stack.0), (load 8 from %stack.1)
+    frame-destroy SEH_SaveRegP 19, 20, 96
+    $x21, $x22 = frame-destroy LDPXi $sp, 10 :: (load 8 from %stack.2), (load 8 from %stack.3)
+    frame-destroy SEH_SaveRegP 21, 22, 80
+    $x23, $x24 = frame-destroy LDPXi $sp, 8 :: (load 8 from %stack.4), (load 8 from %stack.5)
+    frame-destroy SEH_SaveRegP 23, 24, 64
+    $x25, $x26 = frame-destroy LDPXi $sp, 6 :: (load 8 from %stack.6), (load 8 from %stack.7)
+    frame-destroy SEH_SaveRegP 25, 26, 48
+    $d8, $d9 = frame-destroy LDPDi $sp, 2 :: (load 8 from %stack.10), (load 8 from %stack.11)
+    frame-destroy SEH_SaveFRegP 8, 9, 16
+    early-clobber $sp, $d10, $d11 = frame-destroy LDPDpost $sp, 14 :: (load 8 from %stack.12), (load 8 from %stack.13)
+    frame-destroy SEH_SaveFRegP_X 10, 11, -112
+    frame-destroy SEH_EpilogEnd
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
new file mode 100644
index 000000000000..ad489debc46c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.ll
@@ -0,0 +1,96 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+
+; FUNC-LABEL: {{^}}ds_ordered_add:
+; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN-DAG: s_mov_b32 m0,
+; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
+define amdgpu_kernel void @ds_ordered_add(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; Below are various modifications of input operands and shader types.
+
+; FUNC-LABEL: {{^}}ds_ordered_add_counter2:
+; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN-DAG: s_mov_b32 m0,
+; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:776 gds
+define amdgpu_kernel void @ds_ordered_add_counter2(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 2, i1 true, i1 true)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_nodone:
+; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN-DAG: s_mov_b32 m0,
+; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:260 gds
+define amdgpu_kernel void @ds_ordered_add_nodone(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_norelease:
+; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN-DAG: s_mov_b32 m0,
+; GCN: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:4 gds
+define amdgpu_kernel void @ds_ordered_add_norelease(i32 addrspace(2)* inreg %gds, i32 addrspace(1)* %out) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 false, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_cs:
+; GCN: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:772 gds
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+define amdgpu_cs float @ds_ordered_add_cs(i32 addrspace(2)* inreg %gds) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_ps:
+; GCN: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:1796 gds
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+define amdgpu_ps float @ds_ordered_add_ps(i32 addrspace(2)* inreg %gds) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_vs:
+; GCN: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:2820 gds
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+define amdgpu_vs float @ds_ordered_add_vs(i32 addrspace(2)* inreg %gds) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_add_gs:
+; GCN: v_mov_b32_e32 v[[INCR:[0-9]+]], 31
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v[[INCR]] offset:3844 gds
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+define amdgpu_gs float @ds_ordered_add_gs(i32 addrspace(2)* inreg %gds) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* %gds, i32 31, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.ds.ordered.add(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
new file mode 100644
index 000000000000..acb1133c6a0b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.swap.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VIGFX9,FUNC %s
+
+; FUNC-LABEL: {{^}}ds_ordered_swap:
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+define amdgpu_cs float @ds_ordered_swap(i32 addrspace(2)* inreg %gds, i32 %value) {
+  %val = call i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
+; FUNC-LABEL: {{^}}ds_ordered_swap_conditional:
+; GCN: v_cmp_ne_u32_e32 vcc, 0, v0
+; GCN: s_and_saveexec_b64 s[[SAVED:\[[0-9]+:[0-9]+\]]], vcc
+; // We have to use s_cbranch, because ds_ordered_count has side effects with EXEC=0
+; GCN: s_cbranch_execz [[BB:BB._.]]
+; GCN: s_mov_b32 m0, s0
+; VIGFX9-NEXT: s_nop 0
+; GCN-NEXT: ds_ordered_count v{{[0-9]+}}, v0 offset:4868 gds
+; GCN-NEXT: [[BB]]:
+; // Wait for expcnt(0) before modifying EXEC
+; GCN-NEXT: s_waitcnt expcnt(0)
+; GCN-NEXT: s_or_b64 exec, exec, s[[SAVED]]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+define amdgpu_cs float @ds_ordered_swap_conditional(i32 addrspace(2)* inreg %gds, i32 %value) {
+entry:
+  %c = icmp ne i32 %value, 0
+  br i1 %c, label %if-true, label %endif
+
+if-true:
+  %val = call i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* %gds, i32 %value, i32 0, i32 0, i1 false, i32 1, i1 true, i1 true)
+  br label %endif
+
+endif:
+  %v = phi i32 [ %val, %if-true ], [ undef, %entry ]
+  %r = bitcast i32 %v to float
+  ret float %r
+}
+
+declare i32 @llvm.amdgcn.ds.ordered.swap(i32 addrspace(2)* nocapture, i32, i32, i32, i1, i32, i1, i1)
diff --git a/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll b/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
index c3d69c7c0db5..be82e98dffe3 100644
--- a/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
+++ b/test/CodeGen/MSP430/2009-12-21-FrameAddr.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"
 target triple = "msp430-unknown-linux-gnu"
 
-define msp430_intrcc void @foo() nounwind {
+define msp430_intrcc void @foo() nounwind #0 {
 entry:
   %fa = call i8* @llvm.frameaddress(i32 0)
   store i8 0, i8* %fa
@@ -11,3 +11,5 @@ entry:
 }
 
 declare i8* @llvm.frameaddress(i32)
+
+attributes #0 = { noinline nounwind optnone "interrupt"="2" }
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
index e7d7c519657e..bf603704a91b 100644
--- a/test/CodeGen/MSP430/fp.ll
+++ b/test/CodeGen/MSP430/fp.ll
@@ -27,3 +27,5 @@ define msp430_intrcc void @fpb_alloced() #0 {
   call void asm sideeffect "nop", "r"(i8 0)
   ret void
 }
+
+attributes #0 = { noinline nounwind optnone "interrupt"="2" }
diff --git a/test/CodeGen/MSP430/interrupt.ll b/test/CodeGen/MSP430/interrupt.ll
index 5fa0c849c260..94fb3bc457a3 100644
--- a/test/CodeGen/MSP430/interrupt.ll
+++ b/test/CodeGen/MSP430/interrupt.ll
@@ -13,6 +13,9 @@ target triple = "msp430-generic-generic"
 ; instruction RETI, which restores the SR register and branches to the PC where
 ; the interrupt occurred.
 
+; CHECK: .section __interrupt_vector_2,"ax",@progbits
+; CHECK-NEXT: .short ISR
+
 @g = global float 0.0
 
 define msp430_intrcc void @ISR() #0 {
@@ -47,3 +50,4 @@ entry:
   ret void
 }
 
+attributes #0 = { noinline nounwind optnone "interrupt"="2" }
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 617e198bce4c..de20c07d4139 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -1183,38 +1183,58 @@ define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
 define <4 x i32> @test_x86_avx2_psllv_d_const() {
 ; X86-AVX-LABEL: test_x86_avx2_psllv_d_const:
 ; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2]
-; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x58,0x05,A,A,A,A]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295]
+; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
+; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
+; X86-AVX-NEXT: vpsllvd {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A]
 ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4
-; X86-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0x05,A,A,A,A]
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295]
+; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A]
 ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
+; X86-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9]
+; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1]
 ; X86-AVX-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_const:
 ; X86-AVX512VL: # %bb.0:
-; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = <4,9,0,u>
+; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [2,9,0,4294967295]
 ; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
 ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4
-; X86-AVX512VL-NEXT: vpaddd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT: # fixup A - offset: 6, value:
{{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsllvd {{\.LCPI.*}}, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm1 # EVEX TO VEX Compression xmm1 = [1,1,1,4294967295] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] +; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x58,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x47,0xc9] +; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = <4,9,0,u> +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [2,9,0,4294967295] ; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x47,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm1 # EVEX TO VEX Compression xmm1 = [1,1,1,4294967295] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvd %xmm1, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x47,0xc9] +; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res0 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> , <4 x i32> ) %res1 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> , <4 x i32> ) @@ -1241,38 +1261,62 @@ define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psllv_d_256_const() { ; X86-AVX-LABEL: 
test_x86_avx2_psllv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8,8,8,8,8,8,8,8] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x58,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsllvd {{\.LCPI.*}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = <4,9,0,u,12,7,u,0> +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [2,9,0,4294967295,3,7,4294967295,0] ; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm1 # EVEX TO VEX Compression ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsllvd {{\.LCPI.*}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8,8,8,8,8,8,8,8] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x58,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; 
X64-AVX-NEXT: vpsllvd {{.*}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = <4,9,0,u,12,7,u,0> +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [2,9,0,4294967295,3,7,4294967295,0] ; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x47,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm1 # EVEX TO VEX Compression ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x47,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> , <8 x i32> ) %res1 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> , <8 x i32> ) @@ -1316,14 +1360,20 @@ define <2 x i64> @test_x86_avx2_psllv_q_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00] -; X64-AVX-NEXT: vmovq %rax, %xmm0 # encoding: [0xc4,0xe1,0xf9,0x6e,0xc0] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00] -; X64-AVX512VL-NEXT: vmovq %rax, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc0] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,18446744073709551615] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x47,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> , <2 x 
i64> ) ret <2 x i64> %res @@ -1366,15 +1416,19 @@ define <4 x i64> @test_x86_avx2_psllv_q_256_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8,8,8,8] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x19,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,4,4,18446744073709551615] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psllv_q_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [8,8,8,8] -; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x19,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,18446744073709551615] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x47,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> , <4 x i64> ) @@ -1400,38 +1454,62 @@ define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) { define <4 x i32> @test_x86_avx2_psrlv_d_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x58,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsrlvd {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsrlvd {{\.LCPI.*}}, %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = <1,9,0,u> +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [2,9,0,4294967295] ; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpaddd {{\.LCPI.*}}{1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsrlvd {{\.LCPI.*}}, %xmm0, %xmm0 # 
EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %xmm1 # EVEX TO VEX Compression xmm1 = [4,4,4,4294967295] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsrlvd {{\.LCPI.*}}, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpbroadcastd {{.*#+}} xmm0 = [2,2,2,2] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x58,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,0,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 # encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = <1,9,0,u> +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [2,9,0,4294967295] ; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x18,0xfe,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x45,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm1 # EVEX TO VEX Compression xmm1 = [4,4,4,4294967295] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x45,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res0 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> , <4 x i32> ) %res1 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x 
i32> , <4 x i32> ) @@ -1458,38 +1536,62 @@ define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) { define <8 x i32> @test_x86_avx2_psrlv_d_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x58,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX-NEXT: vpaddd {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0x05,A,A,A,A] +; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] ; X86-AVX-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X86-AVX512VL: # %bb.0: -; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = <1,9,0,u,0,7,u,0> +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [2,9,0,4294967295,3,7,4294967295,0] ; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-AVX512VL-NEXT: vpaddd {{\.LCPI.*}}{1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x05,A,A,A,A] -; X86-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vmovdqa {{\.LCPI.*}}, %ymm1 # EVEX TO VEX Compression ymm1 = [4,4,4,4,4,4,4,4294967295] +; X86-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] +; X86-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; X86-AVX512VL-NEXT: retl # encoding: [0xc3] ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_d_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpbroadcastd {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x58,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,0,4294967295,3,7,4294967295,0] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0x05,A,A,A,A] +; X64-AVX-NEXT: vmovdqa 
{{.*#+}} ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 # encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfe,0xc1] ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_d_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = <1,9,0,u,0,7,u,0> +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [2,9,0,4294967295,3,7,4294967295,0] ; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-AVX512VL-NEXT: vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x38,0xfe,0x05,A,A,A,A] -; X64-AVX512VL-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x45,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %ymm1 # EVEX TO VEX Compression ymm1 = [4,4,4,4,4,4,4,4294967295] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xfd,0x6f,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x45,0x0d,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res0 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> , <8 x i32> ) %res1 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> , <8 x i32> ) @@ -1534,14 +1636,20 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00] -; X64-AVX-NEXT: vmovq %rax, %xmm0 # encoding: [0xc4,0xe1,0xf9,0x6e,0xc0] +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4] +; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00] -; X64-AVX512VL-NEXT: vmovq %rax, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x6e,0xc0] +; X64-AVX512VL-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,4] +; X64-AVX512VL-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, 
value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> , <2 x i64> ) ret <2 x i64> %res @@ -1585,15 +1693,19 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_256_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2,2,2,2] -; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x19,0x05,A,A,A,A] +; X64-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,4,4,4] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512VL-LABEL: test_x86_avx2_psrlv_q_256_const: ; X64-AVX512VL: # %bb.0: -; X64-AVX512VL-NEXT: vbroadcastsd {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [2,2,2,2] -; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x19,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: vpbroadcastq {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,4] +; X64-AVX512VL-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X64-AVX512VL-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-AVX512VL-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> , <4 x i64> ) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index a0e8393309de..6a1d9d3da917 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -5229,8 +5229,11 @@ define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> @test_x86_avx512_psllv_d_512_const() { ; CHECK-LABEL: test_x86_avx512_psllv_d_512_const: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,9,0,u,12,7,u,0,32,5,u,0,80,3,u,0> -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; CHECK-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; CHECK-NEXT: vpsllvd {{.*}}(%rip), %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) @@ -5277,8 +5280,11 @@ define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 x i64> @test_x86_avx512_psllv_q_512_const() { ; CHECK-LABEL: test_x86_avx512_psllv_q_512_const: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = <4,9,0,u,12,7,18446744056529682432,0> -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; CHECK-NEXT: vpsllvq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; CHECK-NEXT: vpsllvq {{.*}}(%rip), %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res0 = call <8 x i64> 
@llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) @@ -5397,8 +5403,11 @@ define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) { define <16 x i32> @test_x86_avx512_psrlv_d_512_const() { ; CHECK-LABEL: test_x86_avx512_psrlv_d_512_const: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = <1,9,0,u,0,7,u,0,0,5,u,0,0,3,u,0> -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,4294967295,3,7,4294967295,0,4,5,4294967294,0,5,3,4294967293,0] +; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4294967295] +; CHECK-NEXT: vpsrlvd {{.*}}(%rip), %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res0 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) @@ -5445,8 +5454,11 @@ define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) { define <8 x i64> @test_x86_avx512_psrlv_q_512_const() { ; CHECK-LABEL: test_x86_avx512_psrlv_q_512_const: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = <1,9,0,u,0,7,1073741823,0> -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,9,0,18446744073709551615,3,7,18446744073709551615,0] +; CHECK-NEXT: vpsrlvq {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,4,4,4,4,4,4,18446744073709551615] +; CHECK-NEXT: vpsrlvq {{.*}}(%rip), %zmm1, %zmm1 +; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 8bcdc5d5c029..a220ab0ad734 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -1158,15 +1158,19 @@ declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind define <32 x i16> @test_x86_avx512_psrlv_w_512_const() optsize { ; X86-LABEL: test_x86_avx512_psrlv_w_512_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x05,A,A,A,A] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx512_psrlv_w_512_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*#+}} zmm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x05,A,A,A,A] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0x05,A,A,A,A] ; X64-NEXT: # fixup A - 
offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> , <32 x i16> ) @@ -1377,15 +1381,19 @@ declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind r define <32 x i16> @test_x86_avx512_psllv_w_512_const() optsize { ; X86-LABEL: test_x86_avx512_psllv_w_512_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{.*#+}} zmm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X86-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x05,A,A,A,A] +; X86-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsllvw {{\.LCPI.*}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0x05,A,A,A,A] ; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_x86_avx512_psllv_w_512_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*#+}} zmm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X64-NEXT: # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x05,A,A,A,A] +; X64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0x05,A,A,A,A] ; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res1 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> , <32 x i16> ) diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll index 10ba0e646055..a01252f7d494 100644 --- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -2021,16 +2021,20 @@ define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1 define <8 x i16> @test_int_x86_avx512_psrlv_w_128_const() optsize { ; X86-LABEL: test_int_x86_avx512_psrlv_w_128_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [2,2,2,2,2,2,2,2] -; X86-NEXT: # encoding: [0xc4,0xe2,0x79,0x79,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psrlv_w_128_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [2,2,2,2,2,2,2,2] -; X64-NEXT: # encoding: [0xc4,0xe2,0x79,0x79,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpsrlvw {{.*}}(%rip), 
%xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x10,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -2041,16 +2045,20 @@ declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) define <16 x i16> @test_int_x86_avx512_psrlv_w_256_const() optsize { ; X86-LABEL: test_int_x86_avx512_psrlv_w_256_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X86-NEXT: # encoding: [0xc4,0xe2,0x7d,0x79,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsrlvw {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psrlv_w_256_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X64-NEXT: # encoding: [0xc4,0xe2,0x7d,0x79,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x10,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> , <16 x i16> ) ret <16 x i16> %res @@ -2195,16 +2203,20 @@ define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1 define <8 x i16> @test_int_x86_avx512_psllv_w_128_const() optsize { ; X86-LABEL: test_int_x86_avx512_psllv_w_128_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [8,8,8,8,8,8,8,8] -; X86-NEXT: # encoding: [0xc4,0xe2,0x79,0x79,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{\.LCPI.*}}, %xmm0 # EVEX TO VEX Compression xmm0 = [4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsllvw {{\.LCPI.*}}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psllv_w_128_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [8,8,8,8,8,8,8,8] -; X64-NEXT: # encoding: [0xc4,0xe2,0x79,0x79,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*}}(%rip), %xmm0 # EVEX TO VEX Compression xmm0 = [4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; 
X64-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x12,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> , <8 x i16> ) ret <8 x i16> %res @@ -2216,16 +2228,20 @@ declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) define <16 x i16> @test_int_x86_avx512_psllv_w_256_const() optsize { ; X86-LABEL: test_int_x86_avx512_psllv_w_256_const: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X86-NEXT: # encoding: [0xc4,0xe2,0x7d,0x79,0x05,A,A,A,A] -; X86-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vmovdqa {{\.LCPI.*}}, %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X86-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 +; X86-NEXT: vpsllvw {{\.LCPI.*}}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0x05,A,A,A,A] +; X86-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_psllv_w_256_const: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X64-NEXT: # encoding: [0xc4,0xe2,0x7d,0x79,0x05,A,A,A,A] -; X64-NEXT: # fixup A - offset: 5, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vmovdqa {{.*}}(%rip), %ymm0 # EVEX TO VEX Compression ymm0 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,65535] +; X64-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte +; X64-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x12,0x05,A,A,A,A] +; X64-NEXT: # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> , <16 x i16> ) ret <16 x i16> %res diff --git a/test/Transforms/InstCombine/sink-alloca.ll b/test/Transforms/InstCombine/sink-alloca.ll new file mode 100644 index 000000000000..f2de74ff533b --- /dev/null +++ b/test/Transforms/InstCombine/sink-alloca.ll @@ -0,0 +1,52 @@ +; RUN: opt -instcombine -S < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i686-unknown-linux-gnu" + +; Check that instcombine doesn't sink dynamic allocas across llvm.stacksave. + +; Helper to generate branch conditions. +declare i1 @cond() + +declare i32* @use_and_return(i32*) + +declare i8* @llvm.stacksave() #0 + +declare void @llvm.stackrestore(i8*) #0 + +define void @foo(i32 %x) { +entry: + %c1 = call i1 @cond() + br i1 %c1, label %ret, label %nonentry + +nonentry: ; preds = %entry + %argmem = alloca i32, i32 %x, align 4 + %sp = call i8* @llvm.stacksave() + %c2 = call i1 @cond() + br i1 %c2, label %ret, label %sinktarget + +sinktarget: ; preds = %nonentry + ; Arrange for there to be a single use of %argmem by returning it. 
+ %p = call i32* @use_and_return(i32* nonnull %argmem) + store i32 13, i32* %p, align 4 + call void @llvm.stackrestore(i8* %sp) + %0 = call i32* @use_and_return(i32* %p) + br label %ret + +ret: ; preds = %sinktarget, %nonentry, %entry + ret void +} + +; CHECK-LABEL: define void @foo(i32 %x) +; CHECK: nonentry: +; CHECK: %argmem = alloca i32, i32 %x +; CHECK: %sp = call i8* @llvm.stacksave() +; CHECK: %c2 = call i1 @cond() +; CHECK: br i1 %c2, label %ret, label %sinktarget +; CHECK: sinktarget: +; CHECK: %p = call i32* @use_and_return(i32* nonnull %argmem) +; CHECK: store i32 13, i32* %p +; CHECK: call void @llvm.stackrestore(i8* %sp) +; CHECK: %0 = call i32* @use_and_return(i32* %p) + +attributes #0 = { nounwind } diff --git a/test/Transforms/SLPVectorizer/X86/PR39774.ll b/test/Transforms/SLPVectorizer/X86/PR39774.ll index 8405117090b4..67717a54659c 100644 --- a/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -3,93 +3,185 @@ ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION define void @Test(i32) { -; ALL-LABEL: @Test( -; ALL-NEXT: entry: -; ALL-NEXT: br label [[LOOP:%.*]] -; ALL: loop: -; ALL-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; ALL-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> -; ALL-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; ALL-NEXT: [[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] -; ALL-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef -; ALL-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] -; ALL-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] -; ALL-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] -; ALL-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] -; ALL-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef -; ALL-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] -; ALL-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] -; ALL-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] -; ALL-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef -; ALL-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] -; ALL-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] -; ALL-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] -; ALL-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] -; ALL-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] -; ALL-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef -; ALL-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef -; ALL-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] -; ALL-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] -; ALL-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] -; ALL-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] -; ALL-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] -; ALL-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] -; ALL-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] -; ALL-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] -; ALL-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] -; ALL-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] -; ALL-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] -; ALL-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] -; ALL-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef -; ALL-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] -; ALL-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] -; ALL-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] -; ALL-NEXT: [[VAL_40:%.*]] = and 
i32 [[VAL_38]], undef -; ALL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; ALL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP2]], i32 1 -; ALL-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 -; ALL-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0 -; ALL-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 -; ALL-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP5]], [[TMP8]] -; ALL-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP5]], [[TMP8]] -; ALL-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> -; ALL-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> -; ALL-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]] -; ALL-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; ALL-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; ALL-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; ALL-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; ALL-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; ALL-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] -; ALL-NEXT: [[OP_EXTRA31:%.*]] = and i32 [[OP_EXTRA30]], [[TMP2]] -; ALL-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; ALL-NEXT: br label [[LOOP]] +; CHECK-LABEL: @Test( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: 
[[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] +; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef +; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef +; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef +; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef +; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef +; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef +; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] +; 
CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] +; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] +; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: br label [[LOOP]] +; +; FORCE_REDUCTION-LABEL: @Test( +; FORCE_REDUCTION-NEXT: entry: +; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] +; FORCE_REDUCTION: loop: +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> , [[SHUFFLE]] +; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef +; FORCE_REDUCTION-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; FORCE_REDUCTION-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef +; FORCE_REDUCTION-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef +; FORCE_REDUCTION-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef +; 
FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 +; FORCE_REDUCTION-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]] +; FORCE_REDUCTION-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 +; FORCE_REDUCTION-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]] +; FORCE_REDUCTION-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP3]], [[RDX_SHUF]] +; FORCE_REDUCTION-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[BIN_RDX2:%.*]] = and <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and 
i32 [[OP_EXTRA22]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP2]] +; FORCE_REDUCTION-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 +; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] +; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: br label %loop diff --git a/test/Transforms/SLPVectorizer/X86/PR40310.ll b/test/Transforms/SLPVectorizer/X86/PR40310.ll index 74e62e0e4ba2..ad1434146a5b 100644 --- a/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -7,7 +7,7 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 @@ -28,13 +28,6 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-NEXT: [[V38:%.*]] = and i32 undef, [[V36]] ; CHECK-NEXT: [[V40:%.*]] = and i32 undef, [[V38]] ; CHECK-NEXT: [[V42:%.*]] = and i32 undef, [[V40]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> , i32 [[V42]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <16 x i32> [[TMP4]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> @@ -43,9 +36,12 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <16 x 
i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = and <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP2]] -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]] +; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index a72da6399f22..2c5829d6fcec 100644 --- a/test/Transforms/SROA/basictest.ll +++ b/test/Transforms/SROA/basictest.ll @@ -1745,6 +1745,55 @@ entry: ret void } +declare void @llvm.lifetime.start.isVoid.i64.p0i8(i64, [10 x float]* nocapture) +declare void @llvm.lifetime.end.isVoid.i64.p0i8(i64, [10 x float]* nocapture) +@array = dso_local global [10 x float] undef, align 4 + +define void @test29(i32 %num, i32 %tid) { +; CHECK-LABEL: @test29( +; CHECK-NOT: alloca [10 x float] +; CHECK: ret void + +entry: + %ra = alloca [10 x float], align 4 + call void @llvm.lifetime.start.isVoid.i64.p0i8(i64 40, [10 x float]* nonnull %ra) + + %cmp1 = icmp sgt i32 %num, 0 + br i1 %cmp1, label %bb1, label %bb7 + +bb1: + %tobool = icmp eq i32 %tid, 0 + %conv.i = zext i32 %tid to i64 + %0 = bitcast [10 x float]* %ra to i32* + %1 = load i32, i32* %0, align 4 + %arrayidx5 = getelementptr inbounds [10 x float], [10 x float]* @array, i64 0, i64 %conv.i + %2 = bitcast float* %arrayidx5 to i32* + br label %bb2 + +bb2: + %i.02 = phi i32 [ %num, %bb1 ], [ %sub, %bb5 ] + br i1 %tobool, label %bb3, label %bb4 + +bb3: + br label %bb5 + +bb4: + store i32 %1, i32* %2, align 4 + br label %bb5 + +bb5: + %sub = add i32 %i.02, -1 + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %bb2, label %bb6 + +bb6: + br label %bb7 + +bb7: + call void @llvm.lifetime.end.isVoid.i64.p0i8(i64 40, [10 x float]* nonnull %ra) + ret void +} + !0 = !{!1, !1, i64 0, i64 1} !1 = !{!2, i64 1, !"type_0"} !2 = !{!"root"} diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat index 51f425633a3d..6a0b19b71960 100755 --- a/utils/release/build_llvm_package.bat +++ b/utils/release/build_llvm_package.bat @@ -54,7 +54,7 @@ svn.exe export -r %revision% http://llvm.org/svn/llvm-project/lldb/%branch% llvm REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226. set cmake_flags=-DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON -DCMAKE_INSTALL_UCRT_LIBRARIES=ON -DCLANG_FORMAT_VS_VERSION=%clang_format_vs_version% -DPACKAGE_VERSION=%package_version% -DLLDB_RELOCATABLE_PYTHON=1 -DLLDB_TEST_COMPILER=%cd%\build32_stage0\bin\clang.exe -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " -REM TODO: Run all tests, including lld and compiler-rt. +REM TODO: Run the "check-all" tests. set "VSCMD_START_DIR=%CD%" call "%vsdevcmd%" -arch=x86 @@ -66,7 +66,9 @@ REM Work around VS2017 bug by using MinSizeRel. 
cmake -GNinja %cmake_flags% -DPYTHON_HOME=%python32_dir% -DCMAKE_BUILD_TYPE=MinSizeRel ..\llvm || exit /b
 ninja all || ninja all || ninja all || exit /b
 ninja check || ninja check || ninja check || exit /b
-ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b
 cd..
 
 mkdir build32
@@ -76,7 +78,9 @@ set CXX=..\build32_stage0\bin\clang-cl
 cmake -GNinja %cmake_flags% -DPYTHON_HOME=%python32_dir% ..\llvm || exit /b
 ninja all || ninja all || ninja all || exit /b
 ninja check || ninja check || ninja check || exit /b
-ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b
 ninja package || exit /b
 cd ..
 
@@ -101,7 +105,9 @@ REM Work around VS2017 bug by using MinSizeRel.
 cmake -GNinja %cmake_flags% -DPYTHON_HOME=%python64_dir% -DCMAKE_BUILD_TYPE=MinSizeRel ..\llvm || exit /b
 ninja all || ninja all || ninja all || exit /b
 ninja check || ninja check || ninja check || exit /b
-ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b
 cd..
 
 mkdir build64
@@ -111,6 +117,8 @@ set CXX=..\build64_stage0\bin\clang-cl
 cmake -GNinja %cmake_flags% -DPYTHON_HOME=%python64_dir% ..\llvm || exit /b
 ninja all || ninja all || ninja all || exit /b
 ninja check || ninja check || ninja check || exit /b
-ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b
 ninja package || exit /b
 cd ..
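Each build step in the script above uses the retry idiom `ninja <target> || ninja <target> || ninja <target> || exit /b`: the target is attempted up to three times, and the script aborts only if the last attempt also fails. A minimal sketch of that retry-then-fail control flow in Python — illustrative only, not part of the patch; the target list and attempt count are assumptions, and `ninja` is assumed to be on PATH in an already configured build directory:

    import subprocess
    import sys

    # Assumed target list, mirroring the check targets exercised above.
    TARGETS = ["all", "check", "check-clang", "check-lld", "check-sanitizer"]

    def run_with_retries(cmd, attempts=3):
        # Re-run the command until it succeeds; give up after `attempts` tries,
        # mirroring "ninja X || ninja X || ninja X || exit /b".
        for _ in range(attempts):
            if subprocess.run(cmd).returncode == 0:
                return
        sys.exit(1)

    if __name__ == "__main__":
        for target in TARGETS:
            run_with_retries(["ninja", target])

Retrying a whole `ninja` invocation is cheap because the build is incremental: a repeated run redoes only the steps that failed or are still out of date, while the phony check targets re-run their tests in full.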