Diffstat (limited to 'llvm/lib/Target/ARM/ARMISelLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/ARM/ARMISelLowering.cpp | 475 |
1 file changed, 346 insertions, 129 deletions
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 743cca9ff71f..2e78b52d0993 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -110,6 +110,7 @@
 #include <cstdlib>
 #include <iterator>
 #include <limits>
+#include <optional>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -1370,7 +1371,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
     // encoding; see ARMISD::MEMBARRIER_MCR.)
     setMaxAtomicSizeInBitsSupported(64);
-  } else if (Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) {
+  } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
+             Subtarget->hasForced32BitAtomics()) {
     // Cortex-M (besides Cortex-M0) have 32-bit atomics.
     setMaxAtomicSizeInBitsSupported(32);
   } else {
@@ -1379,6 +1381,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setMaxAtomicSizeInBitsSupported(0);
   }
 
+  setMaxDivRemBitWidthSupported(64);
+
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
@@ -1393,7 +1397,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
     // iff target supports vfp2.
     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
-    setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+    setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
     setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
   }
 
@@ -2627,11 +2631,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   const TargetMachine &TM = getTargetMachine();
   const Module *Mod = MF.getFunction().getParent();
-  const GlobalValue *GV = nullptr;
+  const GlobalValue *GVal = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    GV = G->getGlobal();
+    GVal = G->getGlobal();
   bool isStub =
-      !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
+      !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
 
   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
   bool isLocalARMFunc = false;
@@ -2644,36 +2648,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // those, the target's already in a register, so we don't need to do
     // anything extra.
     if (isa<GlobalAddressSDNode>(Callee)) {
-      // Create a constant pool entry for the callee address
-      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV =
-          ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
+      // When generating execute-only code we use movw movt pair.
+      // Currently execute-only is only available for architectures that
+      // support movw movt, so we are safe to assume that.
+      if (Subtarget->genExecuteOnly()) {
+        assert(Subtarget->useMovt() &&
+               "long-calls with execute-only requires movt and movw!");
+        ++NumMovwMovt;
+        Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
+                             DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
+      } else {
+        // Create a constant pool entry for the callee address
+        unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+        ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
+            GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
 
-      // Get the address of the callee into a register
-      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
-      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
-      Callee = DAG.getLoad(
-          PtrVt, dl, DAG.getEntryNode(), CPAddr,
-          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+        // Get the address of the callee into a register
+        SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
+        Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
+        Callee = DAG.getLoad(
+            PtrVt, dl, DAG.getEntryNode(), Addr,
+            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+      }
     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
       const char *Sym = S->getSymbol();
-      // Create a constant pool entry for the callee address
-      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV =
-          ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
-                                        ARMPCLabelIndex, 0);
-      // Get the address of the callee into a register
-      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
-      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
-      Callee = DAG.getLoad(
-          PtrVt, dl, DAG.getEntryNode(), CPAddr,
-          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+      // When generating execute-only code we use movw movt pair.
+      // Currently execute-only is only available for architectures that
+      // support movw movt, so we are safe to assume that.
+      if (Subtarget->genExecuteOnly()) {
+        assert(Subtarget->useMovt() &&
+               "long-calls with execute-only requires movt and movw!");
+        ++NumMovwMovt;
+        Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
+                             DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
+      } else {
+        // Create a constant pool entry for the callee address
+        unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+        ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
+            *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
+
+        // Get the address of the callee into a register
+        SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
+        Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
+        Callee = DAG.getLoad(
+            PtrVt, dl, DAG.getEntryNode(), Addr,
+            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+      }
     }
   } else if (isa<GlobalAddressSDNode>(Callee)) {
     if (!PreferIndirect) {
       isDirect = true;
-      bool isDef = GV->isStrongDefinitionForLinker();
+      bool isDef = GVal->isStrongDefinitionForLinker();
 
       // ARM call to a local ARM function is predicable.
       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
@@ -2682,7 +2708,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
       Callee = DAG.getNode(
           ARMISD::WrapperPIC, dl, PtrVt,
-          DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
+          DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
       Callee = DAG.getLoad(
           PtrVt, dl, DAG.getEntryNode(), Callee,
           MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
@@ -2692,11 +2718,11 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       assert(Subtarget->isTargetWindows() &&
              "Windows is the only supported COFF target");
       unsigned TargetFlags = ARMII::MO_NO_FLAG;
-      if (GV->hasDLLImportStorageClass())
+      if (GVal->hasDLLImportStorageClass())
         TargetFlags = ARMII::MO_DLLIMPORT;
-      else if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+      else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
         TargetFlags = ARMII::MO_COFFSTUB;
-      Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0,
+      Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
                                           TargetFlags);
       if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
         Callee =
@@ -2704,7 +2730,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
             DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
     } else {
-      Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
+      Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
     }
   }
 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
@@ -2775,8 +2801,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // we've carefully laid out the parameters so that when sp is reset they'll be
   // in the correct location.
   if (isTailCall && !isSibCall) {
-    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
-                               DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
+    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -2837,9 +2862,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   uint64_t CalleePopBytes =
       canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
 
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
-                             DAG.getIntPtrConstant(CalleePopBytes, dl, true),
-                             InFlag, dl);
+  Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl);
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
@@ -2915,7 +2938,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
   int FI = std::numeric_limits<int>::max();
   if (Arg.getOpcode() == ISD::CopyFromReg) {
     Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
-    if (!Register::isVirtualRegister(VR))
+    if (!VR.isVirtual())
       return false;
     MachineInstr *Def = MRI->getVRegDef(VR);
     if (!Def)
@@ -3444,11 +3467,16 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
     return LowerGlobalAddress(GA, DAG);
   }
 
+  // The 16-bit ADR instruction can only encode offsets that are multiples of
+  // 4, so we need to align to at least 4 bytes when we don't have 32-bit ADR.
+  Align CPAlign = CP->getAlign();
+  if (Subtarget->isThumb1Only())
+    CPAlign = std::max(CPAlign, Align(4));
   if (CP->isMachineConstantPoolEntry())
     Res =
-        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
+        DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
   else
-    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
+    Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
 }
@@ -4393,7 +4421,7 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
 
 bool ARMTargetLowering::splitValueIntoRegisterParts(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
-    unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
   bool IsABIRegCopy = CC.has_value();
   EVT ValueVT = Val.getValueType();
   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
@@ -4411,7 +4439,7 @@ bool ARMTargetLowering::splitValueIntoRegisterParts(
 
 SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
     SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
-    MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+    MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
   bool IsABIRegCopy = CC.has_value();
   if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
       PartVT == MVT::f32) {
@@ -4480,7 +4508,7 @@ SDValue ARMTargetLowering::LowerFormalArguments(
   int lastInsIndex = -1;
   if (isVarArg && MFI.hasVAStart()) {
     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
-    if (RegIdx != array_lengthof(GPRArgRegs))
+    if (RegIdx != std::size(GPRArgRegs))
       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
   }
 
@@ -6344,8 +6372,8 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
   return DAG.getMergeValues(Ops, dl);
 }
 
-SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
-                                            SelectionDAG &DAG) const {
+SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
+                                             SelectionDAG &DAG) const {
   // The rounding mode is in bits 23:22 of the FPSCR.
   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
@@ -6738,23 +6766,23 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
       if (ST->hasMVEFloatOps()) {
         Opc = ARMCC::NE; break;
       } else {
-        Invert = true; LLVM_FALLTHROUGH;
+        Invert = true; [[fallthrough]];
       }
     case ISD::SETOEQ:
     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
     case ISD::SETOLT:
-    case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETLT: Swap = true; [[fallthrough]];
     case ISD::SETOGT:
     case ISD::SETGT:  Opc = ARMCC::GT; break;
     case ISD::SETOLE:
-    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETLE:  Swap = true; [[fallthrough]];
     case ISD::SETOGE:
     case ISD::SETGE: Opc = ARMCC::GE; break;
-    case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETUGE: Swap = true; [[fallthrough]];
     case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
-    case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETUGT: Swap = true; [[fallthrough]];
     case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
-    case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETUEQ: Invert = true; [[fallthrough]];
     case ISD::SETONE: {
       // Expand this to (OLT | OGT).
      SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
@@ -6766,7 +6794,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
       Result = DAG.getNOT(dl, Result, VT);
       return Result;
     }
-    case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH;
+    case ISD::SETUO: Invert = true; [[fallthrough]];
     case ISD::SETO: {
       // Expand this to (OLT | OGE).
       SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
@@ -6787,16 +6815,16 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
       if (ST->hasMVEIntegerOps()) {
         Opc = ARMCC::NE; break;
       } else {
-        Invert = true; LLVM_FALLTHROUGH;
+        Invert = true; [[fallthrough]];
      }
     case ISD::SETEQ:  Opc = ARMCC::EQ; break;
-    case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETLT:  Swap = true; [[fallthrough]];
     case ISD::SETGT:  Opc = ARMCC::GT; break;
-    case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETLE:  Swap = true; [[fallthrough]];
     case ISD::SETGE:  Opc = ARMCC::GE; break;
-    case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETULT: Swap = true; [[fallthrough]];
     case ISD::SETUGT: Opc = ARMCC::HI; break;
-    case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
+    case ISD::SETULE: Swap = true; [[fallthrough]];
     case ISD::SETUGE: Opc = ARMCC::HS; break;
     }
 
@@ -6828,25 +6856,25 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
 
   // If one of the operands is a constant vector zero, attempt to fold the
   // comparison to a specialized compare-against-zero form.
-  SDValue SingleOp;
-  if (ISD::isBuildVectorAllZeros(Op1.getNode()))
-    SingleOp = Op0;
-  else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
+  if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
+      (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
+       Opc == ARMCC::NE)) {
     if (Opc == ARMCC::GE)
       Opc = ARMCC::LE;
     else if (Opc == ARMCC::GT)
       Opc = ARMCC::LT;
-    SingleOp = Op1;
+    std::swap(Op0, Op1);
   }
 
   SDValue Result;
-  if (SingleOp.getNode()) {
-    Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, SingleOp,
+  if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
+      (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
+       Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
+    Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
                          DAG.getConstant(Opc, dl, MVT::i32));
-  } else {
+  else
     Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
                          DAG.getConstant(Opc, dl, MVT::i32));
-  }
 
   Result = DAG.getSExtOrTrunc(Result, dl, VT);
 
@@ -7485,6 +7513,28 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
   return true;
 }
 
+static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
+  unsigned NumElts = VT.getVectorNumElements();
+  // Make sure the mask has the right size.
+  if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
+    return false;
+
+  // Half-width truncation patterns (e.g. v4i32 -> v8i16):
+  // !Top &&  SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
+  // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
+  //  Top &&  SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
+  //  Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
+  int Ofs = Top ? 1 : 0;
+  int Upper = SingleSource ? 0 : NumElts;
+  for (int i = 0, e = NumElts / 2; i != e; ++i) {
+    if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
+      return false;
+    if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
+      return false;
+  }
+  return true;
+}
+
 static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
   unsigned NumElts = VT.getVectorNumElements();
   // Make sure the mask has the right size.
@@ -7678,10 +7728,9 @@ static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
   // extend that single value
   SDValue FirstOp = Op.getOperand(0);
   if (!isa<ConstantSDNode>(FirstOp) &&
-      std::all_of(std::next(Op->op_begin()), Op->op_end(),
-                  [&FirstOp](SDUse &U) {
-                    return U.get().isUndef() || U.get() == FirstOp;
-                  })) {
+      llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
+        return U.get().isUndef() || U.get() == FirstOp;
+      })) {
     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
                               DAG.getValueType(MVT::i1));
     return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
@@ -8009,12 +8058,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
     EVT ExtVT = VT.getVectorElementType();
     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
-    SDValue Lower =
-        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
+    SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
     if (Lower.getOpcode() == ISD::BUILD_VECTOR)
       Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
-    SDValue Upper = DAG.getBuildVector(
-        HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
+    SDValue Upper =
+        DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
     if (Upper.getOpcode() == ISD::BUILD_VECTOR)
       Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
     if (Lower && Upper)
@@ -8339,6 +8387,11 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
            (isVMOVNMask(M, VT, true, false) ||
             isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
     return true;
+  else if (Subtarget->hasMVEIntegerOps() &&
+           (isTruncMask(M, VT, false, false) ||
+            isTruncMask(M, VT, false, true) ||
+            isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
+    return true;
   else
     return false;
 }
@@ -8367,15 +8420,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
   default: llvm_unreachable("Unknown shuffle opcode!");
   case OP_VREV:
     // VREV divides the vector in half and swaps within the half.
-    if (VT.getVectorElementType() == MVT::i32 ||
-        VT.getVectorElementType() == MVT::f32)
+    if (VT.getScalarSizeInBits() == 32)
       return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
     // vrev <4 x i16> -> VREV32
-    if (VT.getVectorElementType() == MVT::i16 ||
-        VT.getVectorElementType() == MVT::f16)
+    if (VT.getScalarSizeInBits() == 16)
       return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
     // vrev <4 x i8> -> VREV16
-    assert(VT.getVectorElementType() == MVT::i8);
+    assert(VT.getScalarSizeInBits() == 8);
     return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
   case OP_VDUP0:
   case OP_VDUP1:
@@ -8503,6 +8554,7 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
          "No support for vector shuffle of boolean predicates");
 
   SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   if (isReverseMask(ShuffleMask, VT)) {
     SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
@@ -8520,12 +8572,16 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
   // many cases the generated code might be even better than scalar code
   // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
   // fields in a register into 8 other arbitrary 2-bit fields!
-  SDValue PredAsVector = PromoteMVEPredVector(dl, V1, VT, DAG);
-  EVT NewVT = PredAsVector.getValueType();
+  SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
+  EVT NewVT = PredAsVector1.getValueType();
+  SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
+                                       : PromoteMVEPredVector(dl, V2, VT, DAG);
+  assert(PredAsVector2.getValueType() == NewVT &&
+         "Expected identical vector type in expanded i1 shuffle!");
 
   // Do the shuffle!
-  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector,
-                                          DAG.getUNDEF(NewVT), ShuffleMask);
+  SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
+                                          PredAsVector2, ShuffleMask);
 
   // Now return the result of comparing the shuffled vector with zero,
   // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
@@ -8813,10 +8869,29 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     }
   }
 
-  if (ST->hasMVEIntegerOps() && EltSize <= 32)
+  if (ST->hasMVEIntegerOps() && EltSize <= 32) {
     if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
       return V;
 
+    for (bool Top : {false, true}) {
+      for (bool SingleSource : {false, true}) {
+        if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
+          MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
+          MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
+          SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
+          SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
+                                   SingleSource ? V1 : V2);
+          if (Top) {
+            SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
+            Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
+            Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
+          }
+          return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
+        }
+      }
+    }
+  }
+
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   unsigned NumElts = VT.getVectorNumElements();
@@ -9015,7 +9090,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
 
   // Extract the vector elements from Op1 and Op2 one by one and truncate them
   // to be the right size for the destination. For example, if Op1 is v4i1
-  // then the promoted vector is v4i32. The result of concatentation gives a
+  // then the promoted vector is v4i32. The result of concatenation gives a
   // v8i1, which when promoted is v8i16. That means each i32 element from Op1
   // needs truncating to i16 and inserting in the result.
   EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
@@ -10391,7 +10466,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::TRUNCATE:      return LowerTruncate(Op.getNode(), DAG, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:   return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
-  case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
+  case ISD::GET_ROUNDING:  return LowerGET_ROUNDING(Op, DAG);
   case ISD::SET_ROUNDING:  return LowerSET_ROUNDING(Op, DAG);
   case ISD::MUL:           return LowerMUL(Op, DAG);
   case ISD::SDIV:
@@ -12214,7 +12289,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 
   // Any ARM instruction that sets the 's' bit should specify an optional
   // "cc_out" operand in the last operand position.
-  if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
+  if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
     assert(!NewOpc && "Optional cc_out operand required");
     return;
   }
@@ -12300,7 +12375,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
     // (zext cc) can never be the all ones value.
     if (AllOnes)
       return false;
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case ISD::SIGN_EXTEND: {
     SDLoc dl(N);
     EVT VT = N->getValueType(0);
@@ -13706,7 +13781,7 @@ static SDValue PerformSHLSimplify(SDNode *N,
     return SDValue();
 
   // Check that all the users could perform the shl themselves.
-  for (auto U : N->uses()) {
+  for (auto *U : N->uses()) {
     switch(U->getOpcode()) {
     default:
       return SDValue();
@@ -13748,10 +13823,13 @@ static SDValue PerformSHLSimplify(SDNode *N,
   APInt C2Int = C2->getAPIntValue();
   APInt C1Int = C1ShlC2->getAPIntValue();
+  unsigned C2Width = C2Int.getBitWidth();
+  if (C2Int.uge(C2Width))
+    return SDValue();
+  uint64_t C2Value = C2Int.getZExtValue();
 
   // Check that performing a lshr will not lose any information.
-  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
-                                     C2Int.getBitWidth() - C2->getZExtValue());
+  APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
   if ((C1Int & Mask) != C1Int)
     return SDValue();
 
@@ -14676,7 +14754,7 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
     return SDValue();
   unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
   unsigned LSB = countTrailingZeros(~InvMask);
-  unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
+  unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
   assert(Width <
             static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
         "undefined behavior");
@@ -15823,7 +15901,7 @@ static SDValue TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
     Tys[n] = AlignedVecTy;
   Tys[n++] = MVT::i32;
   Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
 
   // Then, gather the new node's operands.
   SmallVector<SDValue, 8> Ops;
@@ -16134,7 +16212,7 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
     Tys[n] = VecTy;
   Tys[n++] = MVT::i32;
   Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
 
   // Then, gather the new node's operands.
   SmallVector<SDValue, 8> Ops;
@@ -16215,7 +16293,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   for (n = 0; n < NumVecs; ++n)
     Tys[n] = VT;
   Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1));
+  SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
@@ -16512,7 +16590,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
 
     SDValue Store = DAG.getTruncStore(
         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
-        NewToVT, Alignment.value(), MMOFlags, AAInfo);
+        NewToVT, Alignment, MMOFlags, AAInfo);
     Stores.push_back(Store);
   }
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -16553,7 +16631,7 @@ static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
     SDValue Extract = Trunc.getOperand(i);
     SDValue Store = DAG.getTruncStore(
        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
-        NewToVT, Alignment.value(), MMOFlags, AAInfo);
+        NewToVT, Alignment, MMOFlags, AAInfo);
     Stores.push_back(Store);
   }
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -16589,8 +16667,8 @@ static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
   AAMDNodes AAInfo = St->getAAInfo();
   EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
   SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
-                                    St->getPointerInfo(), NewToVT,
-                                    Alignment.value(), MMOFlags, AAInfo);
+                                    St->getPointerInfo(), NewToVT, Alignment,
+                                    MMOFlags, AAInfo);
 
   return Store;
 }
@@ -18333,7 +18411,7 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
     if (S0->getOperand(0) == S1->getOperand(0) &&
         S0->getOperand(1) == S1->getOperand(1)) {
       // Construct complete shuffle mask
-      SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
+      SmallVector<int, 8> Mask(S0->getMask());
       Mask.append(S1->getMask().begin(), S1->getMask().end());
 
       if (isVMOVNTruncMask(Mask, VT, false))
@@ -18793,7 +18871,7 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
                                                        Align Alignment,
                                                        MachineMemOperand::Flags,
-                                                       bool *Fast) const {
+                                                       unsigned *Fast) const {
   // Depends what it gets converted into if the type is weird.
   if (!VT.isSimple())
     return false;
@@ -18817,7 +18895,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
     // A big-endian target may also explicitly support unaligned accesses
     if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
       if (Fast)
-        *Fast = true;
+        *Fast = 1;
       return true;
     }
   }
@@ -18829,7 +18907,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
     if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
          Ty == MVT::v2i1)) {
       if (Fast)
-        *Fast = true;
+        *Fast = 1;
       return true;
     }
 
@@ -18855,7 +18933,7 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
         Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
         Ty == MVT::v2f64) {
       if (Fast)
-        *Fast = true;
+        *Fast = 1;
       return true;
     }
 
@@ -18868,7 +18946,7 @@ EVT ARMTargetLowering::getOptimalMemOpType(
   // See if we can use NEON instructions for this...
   if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
       !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
-    bool Fast;
+    unsigned Fast;
    if (Op.size() >= 16 &&
         (Op.isAligned(Align(16)) ||
          (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
@@ -19147,18 +19225,6 @@ bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   return true;
 }
 
-InstructionCost ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
-                                                        const AddrMode &AM,
-                                                        Type *Ty,
-                                                        unsigned AS) const {
-  if (isLegalAddressingMode(DL, AM, Ty, AS)) {
-    if (Subtarget->hasFPAO())
-      return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
-    return 0;
-  }
-  return -1;
-}
-
 /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
 /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
 /// expanded to FMAs when this method returns true, otherwise fmuladd is
@@ -20151,6 +20217,8 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     case 'w':
       if (VT == MVT::Other)
         break;
+      if (VT == MVT::f16 || VT == MVT::bf16)
+        return RCPair(0U, &ARM::HPRRegClass);
      if (VT == MVT::f32)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
@@ -20171,6 +20239,8 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     case 't':
       if (VT == MVT::Other)
         break;
+      if (VT == MVT::f16 || VT == MVT::bf16)
+        return RCPair(0U, &ARM::HPRRegClass);
       if (VT == MVT::f32 || VT == MVT::i32)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
@@ -20422,9 +20492,22 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
          "Invalid opcode for Div/Rem lowering");
   bool isSigned = (Opcode == ISD::SDIVREM);
   EVT VT = Op->getValueType(0);
-  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   SDLoc dl(Op);
 
+  if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
+    SmallVector<SDValue> Result;
+    if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
+      SDValue Res0 =
+          DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
+      SDValue Res1 =
+          DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
+      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
+                         {Res0, Res1});
+    }
+  }
+
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
   // If the target has hardware divide, use divide + multiply + subtract:
   //     div = a / b
   //     rem = a - b * div
@@ -20473,11 +20556,20 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
 // Lowers REM using divmod helpers
 // see RTABI section 4.2/4.3
 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
+  EVT VT = N->getValueType(0);
+
+  if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
+    SmallVector<SDValue> Result;
+    if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
+      return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
+                         Result[0], Result[1]);
+  }
+
   // Build return types (div and rem)
   std::vector<Type*> RetTyParams;
   Type *RetTyElement;
 
-  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (VT.getSimpleVT().SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   RetTyElement = Type::getInt8Ty(*DAG.getContext());  break;
   case MVT::i16:  RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
@@ -20978,7 +21070,7 @@ Instruction *ARMTargetLowering::emitLeadingFence(IRBuilderBase &Builder,
   case AtomicOrdering::SequentiallyConsistent:
     if (!Inst->hasAtomicStore())
       return nullptr; // Nothing to do
-    LLVM_FALLTHROUGH;
+    [[fallthrough]];
   case AtomicOrdering::Release:
   case AtomicOrdering::AcquireRelease:
     if (Subtarget->preferISHSTBarriers())
@@ -21105,7 +21197,10 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic(
   return InsertFencesForAtomic;
 }
 
-bool ARMTargetLowering::useLoadStackGuardNode() const { return true; }
+bool ARMTargetLowering::useLoadStackGuardNode() const {
+  // ROPI/RWPI are not supported currently.
+  return !Subtarget->isROPI() && !Subtarget->isRWPI();
+}
 
 void ARMTargetLowering::insertSSPDeclarations(Module &M) const {
   if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
@@ -21156,7 +21251,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
     return false;
 
   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
-  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
+  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
   // We can do a store + vector extract on any vector that fits perfectly in a D
   // or Q register.
   if (BitWidth == 64 || BitWidth == 128) {
@@ -21166,16 +21261,36 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
   return false;
 }
 
-bool ARMTargetLowering::isCheapToSpeculateCttz() const {
+bool ARMTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }
 
-bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
+bool ARMTargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
   return Subtarget->hasV6T2Ops();
 }
 
-bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
-  return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
+bool ARMTargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  if (!Subtarget->hasV7Ops())
+    return false;
+
+  // Sink the `and` instruction only if the mask would fit into a modified
+  // immediate operand.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask || Mask->getValue().getBitWidth() > 32u)
+    return false;
+  auto MaskVal = unsigned(Mask->getValue().getZExtValue());
+  return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
+                                : ARM_AM::getSOImmVal(MaskVal)) != -1;
+}
+
+TargetLowering::ShiftLegalizationStrategy
+ARMTargetLowering::preferredShiftLegalizationStrategy(
+    SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
+  if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
+    return ShiftLegalizationStrategy::LowerToLibcall;
+  return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
+                                                            ExpansionFactor);
 }
 
 Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
@@ -21661,11 +21776,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
   case HA_DOUBLE:
     return false;
   case HA_VECT64:
-    return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
+    return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
   case HA_VECT128:
-    return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
+    return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
   case HA_UNKNOWN:
-    switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
+    switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
     case 64:
       Base = HA_VECT64;
       return true;
@@ -21777,3 +21892,105 @@ void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasMVEIntegerOps();
+}
+
+bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+  if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
+    return false;
+
+  // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
+  if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
+    return Subtarget->hasMVEFloatOps();
+
+  if (Operation != ComplexDeinterleavingOperation::CAdd)
+    return false;
+
+  return Subtarget->hasMVEIntegerOps() &&
+         (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
+          ScalarTy->isIntegerTy(32));
+}
+
+Value *ARMTargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+    Value *Accumulator) const {
+
+  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+  IRBuilder<> B(I);
+
+  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+  assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
+
+  if (TyWidth > 128) {
+    int Stride = Ty->getNumElements() / 2;
+    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+    auto SplitSeqVec = llvm::to_vector(SplitSeq);
+    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+    Value *LowerSplitAcc = nullptr;
+    Value *UpperSplitAcc = nullptr;
+
+    if (Accumulator) {
+      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+    }
+
+    auto *LowerSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+    auto *UpperSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+  }
+
+  auto *IntTy = Type::getInt32Ty(B.getContext());
+
+  ConstantInt *ConstRotation = nullptr;
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
+
+    if (Accumulator)
+      return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
+                               {ConstRotation, Accumulator, InputB, InputA});
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
+                             {ConstRotation, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    // 1 means the value is not halved.
+    auto *ConstHalving = ConstantInt::get(IntTy, 1);
+
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+      ConstRotation = ConstantInt::get(IntTy, 0);
+    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+      ConstRotation = ConstantInt::get(IntTy, 1);
+
+    if (!ConstRotation)
+      return nullptr; // Invalid rotation for arm_mve_vcaddq
+
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
+                             {ConstHalving, ConstRotation, InputA, InputB});
+  }
+
+  return nullptr;
+}
