Diffstat (limited to 'lib/Target/AArch64/AArch64ISelLowering.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64ISelLowering.cpp | 535
1 file changed, 454 insertions, 81 deletions
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 7becc99fb5c7..2746117e8ee5 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -161,6 +162,29 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); } + if (Subtarget->hasSVE()) { + // Add legal sve predicate types + addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass); + addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass); + + // Add legal sve data types + addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass); + + addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass); + addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + } + // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -283,7 +307,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } @@ -297,7 +321,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } @@ -606,6 +630,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; + MaxLoadsPerMemcmpOptSize = 4; + MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign() + ? MaxLoadsPerMemcmpOptSize : 8; + setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); @@ -613,10 +641,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, EnableExtLdPromotion = true; // Set required alignment. - setMinFunctionAlignment(2); + setMinFunctionAlignment(Align(4)); // Set preferred alignments. - setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); - setPrefLoopAlignment(STI.getPrefLoopAlignment()); + setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment())); + setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment())); // Only change the limit for entries in a jump table if specified by // the sub target, but not at the command line. 
@@ -725,7 +753,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { @@ -741,7 +769,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); - for (MVT InnerVT : MVT::vector_valuetypes()) { + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); @@ -773,6 +801,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } + if (Subtarget->hasSVE()) { + for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { + if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) + setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); + } + } + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } @@ -1025,6 +1060,14 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( Known.One &= Known2.One; break; } + case AArch64ISD::LOADgot: + case AArch64ISD::ADDlow: { + if (!Subtarget->isTargetILP32()) + break; + // In ILP32 mode all valid pointers are in the low 4GB of the address-space. + Known.Zero = APInt::getHighBitsSet(64, 32); + break; + } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); @@ -1100,6 +1143,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. 
+ Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -1238,6 +1307,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::STZG: return "AArch64ISD::STZG"; case AArch64ISD::ST2G: return "AArch64ISD::ST2G"; case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G"; + case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI"; + case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; + case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; + case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; } return nullptr; } @@ -1263,9 +1336,9 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI.getOperand(0).getReg(); - unsigned IfTrueReg = MI.getOperand(1).getReg(); - unsigned IfFalseReg = MI.getOperand(2).getReg(); + Register DestReg = MI.getOperand(0).getReg(); + Register IfTrueReg = MI.getOperand(1).getReg(); + Register IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); @@ -2140,7 +2213,8 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow @@ -2349,7 +2423,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. 
SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions, SDLoc(Op)).first; } @@ -2419,7 +2494,8 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; + MakeLibCallOptions CallOptions; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -2773,6 +2849,19 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_sunpkhi: + return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_sunpklo: + return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpkhi: + return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_uunpklo: + return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -2937,6 +3026,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::SPLAT_VECTOR: + return LowerSPLAT_VECTOR(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: @@ -3014,8 +3105,11 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; - return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; - case CallingConv::Win64: + if (!IsVarArg) + return CC_AArch64_DarwinPCS; + return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg + : CC_AArch64_DarwinPCS_VarArg; + case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; case CallingConv::AArch64_VectorCall: return CC_AArch64_AAPCS; @@ -3038,6 +3132,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); @@ -3094,11 +3189,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( continue; } + SDValue ArgValue; if (VA.isRegLoc()) { // Arguments stored in registers. 
EVT RegVT = VA.getLocVT(); - - SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) @@ -3113,6 +3207,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; + else if (RegVT.isScalableVector() && + RegVT.getVectorElementType() == MVT::i1) + RC = &AArch64::PPRRegClass; + else if (RegVT.isScalableVector()) + RC = &AArch64::ZPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); @@ -3128,20 +3227,23 @@ SDValue AArch64TargetLowering::LowerFormalArguments( llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: - // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt - // nodes after our lowering. - assert(RegVT == Ins[i].VT && "incorrect register location selected"); + break; + case CCValAssign::AExtUpper: + ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue, + DAG.getConstant(32, DL, RegVT)); + ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT()); break; } - - InVals.push_back(ArgValue); - } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); @@ -3156,7 +3258,6 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; @@ -3165,9 +3266,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( switch (VA.getLocInfo()) { default: break; + case CCValAssign::Trunc: case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; @@ -3184,8 +3290,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); - InVals.push_back(ArgValue); } + if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer()) + ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(), + ArgValue, DAG.getValueType(MVT::i32)); + InVals.push_back(ArgValue); } // varargs @@ -3202,8 +3311,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); - // We currently pass all varargs at 8-byte alignment. - StackOffset = ((StackOffset + 7) & ~7); + // We currently pass all varargs at 8-byte alignment, or 4 for ILP32 + StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 
4 : 8); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); if (MFI.hasMustTailInVarArgFunc()) { @@ -3233,8 +3342,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); - unsigned Reg = - MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); + Register Reg = + MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]); @@ -3366,6 +3475,7 @@ SDValue AArch64TargetLowering::LowerCallResult( : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector<CCValAssign, 16> RVLocs; + DenseMap<unsigned, SDValue> CopiedRegs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); @@ -3383,10 +3493,16 @@ SDValue AArch64TargetLowering::LowerCallResult( continue; } - SDValue Val = - DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); - Chain = Val.getValue(1); - InFlag = Val.getValue(2); + // Avoid copying a physreg twice since RegAllocFast is incompetent and only + // allows one use of a physreg per block. + SDValue Val = CopiedRegs.lookup(VA.getLocReg()); + if (!Val) { + Val = + DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); + Chain = Val.getValue(1); + InFlag = Val.getValue(2); + CopiedRegs[VA.getLocReg()] = Val; + } switch (VA.getLocInfo()) { default: @@ -3396,6 +3512,15 @@ SDValue AArch64TargetLowering::LowerCallResult( case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; + case CCValAssign::AExtUpper: + Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val, + DAG.getConstant(32, DL, VA.getLocVT())); + LLVM_FALLTHROUGH; + case CCValAssign::AExt: + LLVM_FALLTHROUGH; + case CCValAssign::ZExt: + Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT()); + break; } InVals.push_back(Val); @@ -3593,6 +3718,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction::CallSiteInfo CSInfo; bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -3709,6 +3835,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, getPointerTy(DAG.getDataLayout())); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallSet<unsigned, 8> RegsUsed; SmallVector<SDValue, 8> MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -3716,7 +3843,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, const auto &Forwards = FuncInfo->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT); - RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); + RegsToPass.emplace_back(F.PReg, Val); } } @@ -3747,12 +3874,25 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; case CCValAssign::BCvt: - Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); + Arg = DAG.getBitcast(VA.getLocVT(), Arg); + break; + case CCValAssign::Trunc: + Arg = DAG.getZExtOrTrunc(Arg, DL, 
VA.getLocVT()); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; + case CCValAssign::Indirect: + assert(VA.getValVT().isScalableVector() && + "Only scalable vectors can be passed indirectly"); + llvm_unreachable("Spilling of SVE vectors not yet implemented"); } if (VA.isRegLoc()) { @@ -3764,7 +3904,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, "unexpected use of 'returned'"); IsThisReturn = true; } - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + if (RegsUsed.count(VA.getLocReg())) { + // If this register has already been used then we're trying to pack + // parts of an [N x i32] into an X-register. The extension type will + // take care of putting the two halves in the right place but we have to + // combine them. + SDValue &Bits = + std::find_if(RegsToPass.begin(), RegsToPass.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + // Call site info is used for function's parameter entry value + // tracking. For now we track only simple cases when parameter + // is transferred through whole register. + CSInfo.erase(std::remove_if(CSInfo.begin(), CSInfo.end(), + [&VA](MachineFunction::ArgRegPair ArgReg) { + return ArgReg.Reg == VA.getLocReg(); + }), + CSInfo.end()); + } else { + RegsToPass.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + const TargetOptions &Options = DAG.getTarget().Options; + if (Options.EnableDebugEntryValues) + CSInfo.emplace_back(VA.getLocReg(), i); + } } else { assert(VA.isMemLoc()); @@ -3899,6 +4065,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); + // Check callee args/returns for SVE registers and set calling convention + // accordingly. + if (CallConv == CallingConv::C) { + bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){ + return Out.VT.isScalableVector(); + }); + bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){ + return In.VT.isScalableVector(); + }); + + if (CalleeInSVE || CalleeOutSVE) + CallConv = CallingConv::AArch64_SVE_VectorCall; + } + // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); @@ -3930,12 +4110,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); - return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); + DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); + return Ret; } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); + DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; @@ -3983,7 +4166,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // Copy the result values into the output registers. 
SDValue Flag; - SmallVector<SDValue, 4> RetOps(1, Chain); + SmallVector<std::pair<unsigned, SDValue>, 4> RetVals; + SmallSet<unsigned, 4> RegsUsed; for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; @@ -4005,11 +4189,38 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; + case CCValAssign::AExt: + case CCValAssign::ZExt: + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + break; + case CCValAssign::AExtUpper: + assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits"); + Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT()); + Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg, + DAG.getConstant(32, DL, VA.getLocVT())); + break; } - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); + if (RegsUsed.count(VA.getLocReg())) { + SDValue &Bits = + std::find_if(RetVals.begin(), RetVals.end(), + [=](const std::pair<unsigned, SDValue> &Elt) { + return Elt.first == VA.getLocReg(); + }) + ->second; + Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg); + } else { + RetVals.emplace_back(VA.getLocReg(), Arg); + RegsUsed.insert(VA.getLocReg()); + } + } + + SmallVector<SDValue, 4> RetOps(1, Chain); + for (auto &RetVal : RetVals) { + Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag); Flag = Chain.getValue(1); - RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + RetOps.push_back( + DAG.getRegister(RetVal.first, RetVal.second.getValueType())); } // Windows AArch64 ABIs require that for returning structs by value we copy @@ -4139,8 +4350,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); - unsigned char OpFlags = - Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); + unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && @@ -4204,6 +4414,7 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); + MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout()); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); SDValue TLVPAddr = @@ -4214,13 +4425,15 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( - MVT::i64, DL, Chain, DescAddr, + PtrMemVT, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), - /* Alignment = */ 8, - MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | - MachineMemOperand::MODereferenceable); + /* Alignment = */ PtrMemVT.getSizeInBits() / 8, + MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); + // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer. + FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT); + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); @@ -4470,7 +4683,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. 
if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -4736,7 +4949,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { @@ -4798,7 +5011,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { - softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); + softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. @@ -5096,6 +5309,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); + FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); @@ -5202,15 +5416,15 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = - Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; + unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8; + unsigned VaListSize = (Subtarget->isTargetDarwin() || + Subtarget->isTargetWindows()) ? PtrSize : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); - return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), - Op.getOperand(2), - DAG.getConstant(VaListSize, DL, MVT::i32), - 8, false, false, false, MachinePointerInfo(DestSV), + return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize, + false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } @@ -5224,12 +5438,15 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); + unsigned MinSlotSize = Subtarget->isTargetILP32() ? 
4 : 8; auto PtrVT = getPointerTy(DAG.getDataLayout()); - - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); + auto PtrMemVT = getPointerMemTy(DAG.getDataLayout()); + SDValue VAList = + DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); + VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT); - if (Align > 8) { + if (Align > MinSlotSize) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); @@ -5238,14 +5455,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); - uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); + unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) - ArgSize = 8; + ArgSize = std::max(ArgSize, MinSlotSize); bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; @@ -5255,6 +5472,8 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); + VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT); + // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); @@ -5284,10 +5503,15 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SDLoc DL(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = - DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); + DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); + + if (Subtarget->isTargetILP32()) + FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr, + DAG.getValueType(VT)); + return FrameAddr; } @@ -5306,9 +5530,9 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
-unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = MatchRegisterName(RegName); +Register AArch64TargetLowering:: +getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { + Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false); @@ -5653,6 +5877,21 @@ const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { return "r"; } +enum PredicateConstraint { + Upl, + Upa, + Invalid +}; + +static PredicateConstraint parsePredicateConstraint(StringRef Constraint) { + PredicateConstraint P = PredicateConstraint::Invalid; + if (Constraint == "Upa") + P = PredicateConstraint::Upa; + if (Constraint == "Upl") + P = PredicateConstraint::Upl; + return P; +} + /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType @@ -5661,19 +5900,30 @@ AArch64TargetLowering::getConstraintType(StringRef Constraint) const { switch (Constraint[0]) { default: break; - case 'z': - return C_Other; case 'x': case 'w': + case 'y': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; + case 'I': + case 'J': + case 'K': + case 'L': + case 'M': + case 'N': + case 'Y': + case 'Z': + return C_Immediate; + case 'z': case 'S': // A symbolic address return C_Other; } - } + } else if (parsePredicateConstraint(Constraint) != + PredicateConstraint::Invalid) + return C_RegisterClass; return TargetLowering::getConstraintType(Constraint); } @@ -5697,12 +5947,17 @@ AArch64TargetLowering::getSingleConstraintMatchWeight( break; case 'x': case 'w': + case 'y': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; + case 'U': + if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid) + weight = CW_Register; + break; } return weight; } @@ -5719,6 +5974,8 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'w': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPRRegClass); if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) @@ -5733,9 +5990,25 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( case 'x': if (!Subtarget->hasFPARMv8()) break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_4bRegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; + case 'y': + if (!Subtarget->hasFPARMv8()) + break; + if (VT.isScalableVector()) + return std::make_pair(0U, &AArch64::ZPR_3bRegClass); + break; + } + } else { + PredicateConstraint PC = parsePredicateConstraint(Constraint); + if (PC != PredicateConstraint::Invalid) { + assert(VT.isScalableVector()); + bool restricted = (PC == PredicateConstraint::Upl); + return restricted ? 
std::make_pair(0U, &AArch64::PPR_3bRegClass) + : std::make_pair(0U, &AArch64::PPRRegClass); } } if (StringRef("{cc}").equals_lower(Constraint)) @@ -6279,6 +6552,8 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); + if (NumElts % 2 != 0) + return false; WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { @@ -6446,8 +6721,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); - EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements() / 2); + EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); @@ -6790,6 +7064,41 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return GenerateTBL(Op, ShuffleMask, DAG); } +SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + EVT VT = Op.getValueType(); + EVT ElemVT = VT.getScalarType(); + + SDValue SplatVal = Op.getOperand(0); + + // Extend input splat value where needed to fit into a GPR (32b or 64b only) + // FPRs don't have this restriction. + switch (ElemVT.getSimpleVT().SimpleTy) { + case MVT::i8: + case MVT::i16: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); + break; + case MVT::i64: + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + break; + case MVT::i32: + // Fine as is + break; + // TODO: we can support splats of i1s and float types, but haven't added + // patterns yet. + case MVT::i1: + case MVT::f16: + case MVT::f32: + case MVT::f64: + default: + llvm_unreachable("Unsupported SPLAT_VECTOR input operand type"); + break; + } + + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); +} + static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); @@ -8063,7 +8372,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; @@ -8089,7 +8398,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; - Info.align = 0; + Info.align.reset(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; @@ -8101,7 +8410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } @@ -8112,7 +8421,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; - Info.align = 
DL.getABITypeAlignment(PtrTy->getElementType()); + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } @@ -8122,7 +8431,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: @@ -8131,7 +8440,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; - Info.align = 16; + Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: @@ -8278,7 +8587,7 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = - countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; + countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) @@ -8739,6 +9048,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType( return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits<int64_t>::min()) { @@ -10065,6 +10407,14 @@ static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: + // For positive shift amounts we can use SHL, as ushl/sshl perform a regular + // left shift for positive shift amounts. Below, we only replace the current + // node with VSHL, if this condition is met. 
+ Opcode = AArch64ISD::VSHL; + IsRightShift = false; + break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { @@ -10151,6 +10501,8 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_ushl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: @@ -10482,10 +10834,10 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return ReplacedSplat; SDLoc DL(S); - unsigned NumElts = VT.getVectorNumElements() / 2; + // Split VT into two. - EVT HalfVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); + unsigned NumElts = HalfVT.getVectorNumElements(); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, @@ -10567,7 +10919,7 @@ static SDValue performPostLD1Combine(SDNode *N, // are predecessors to each other or the Vector. SmallPtrSet<const SDNode *, 32> Visited; SmallVector<const SDNode *, 16> Worklist; - Visited.insert(N); + Visited.insert(Addr.getNode()); Worklist.push_back(User); Worklist.push_back(LD); Worklist.push_back(Vector.getNode()); @@ -11983,6 +12335,27 @@ bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( return Mask->getValue().isPowerOf2(); } +bool AArch64TargetLowering:: + shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, + unsigned OldShiftOpcode, unsigned NewShiftOpcode, + SelectionDAG &DAG) const { + // Does baseline recommend not to perform the fold by default? + if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( + X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) + return false; + // Else, if this is a vector shift, prefer 'shl'. + return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL; +} + +bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, + SDNode *N) const { + if (DAG.getMachineFunction().getFunction().hasMinSize() && + !Subtarget->isTargetWindows()) + return false; + return true; +} + void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); @@ -12009,7 +12382,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be |
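
A brief illustration of the inline-asm changes above: the new "Upa" and "Upl" predicate constraints resolve to the PPR and PPR_3b register classes, and the "w", "x" and "y" constraints now map scalable vector operands onto the ZPR classes. A minimal sketch of user code that would exercise this path follows; it is not part of the patch, and the function name, the ACLE header arm_sve.h and the build flags are assumptions about an SVE-enabled toolchain.

// Sketch only: inline asm relying on the SVE constraints handled by
// getConstraintType()/getRegForInlineAsmConstraint() in this patch.
// Assumes an SVE-capable toolchain, e.g. -march=armv8-a+sve.
#include <arm_sve.h>

svint32_t add_masked(svbool_t pg, svint32_t a, svint32_t b) {
  // "Upa" accepts any SVE predicate register (p0-p15, the PPR class);
  // "Upl" would restrict the operand to p0-p3 (PPR_3b).
  // "w" selects an SVE data register (ZPR) now that scalable vector types
  // are legal. SVE's predicated ADD is destructive, so Zdn is read and written.
  asm("add %0.s, %1/m, %0.s, %2.s"
      : "+w"(a)
      : "Upa"(pg), "w"(b));
  return a;
}

The "y" constraint behaves the same way but is restricted to z0-z7 (ZPR_3b), which matters for instructions whose encodings only accept the low Z registers.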