Diffstat (limited to 'lib/Target/ARM/ARMISelLowering.cpp')
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 1556
1 file changed, 1129 insertions, 427 deletions
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 21de0f6a7630..18bb9bf3eccc 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1,9 +1,8 @@ //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// // -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // @@ -80,6 +79,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -113,6 +113,7 @@ #include <vector> using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "arm-isel" @@ -220,6 +221,121 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } +void ARMTargetLowering::setAllExpand(MVT VT) { + for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) + setOperationAction(Opc, VT, Expand); + + // We support these really simple operations even on types where all + // the actual arithmetic has to be broken down into simpler + // operations or turned into library calls. + setOperationAction(ISD::BITCAST, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::UNDEF, VT, Legal); +} + +void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To, + LegalizeAction Action) { + setLoadExtAction(ISD::EXTLOAD, From, To, Action); + setLoadExtAction(ISD::ZEXTLOAD, From, To, Action); + setLoadExtAction(ISD::SEXTLOAD, From, To, Action); +} + +void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { + const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 }; + + for (auto VT : IntTypes) { + addRegisterClass(VT, &ARM::QPRRegClass); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); + + // No native support for these. 
+ setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + + if (!HasMVEFP) { + setOperationAction(ISD::SINT_TO_FP, VT, Expand); + setOperationAction(ISD::UINT_TO_FP, VT, Expand); + setOperationAction(ISD::FP_TO_SINT, VT, Expand); + setOperationAction(ISD::FP_TO_UINT, VT, Expand); + } + } + + const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; + for (auto VT : FloatTypes) { + addRegisterClass(VT, &ARM::QPRRegClass); + if (!HasMVEFP) + setAllExpand(VT); + + // These are legal or custom whether we have MVE.fp or not + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getVectorElementType(), Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal); + + if (HasMVEFP) { + setOperationAction(ISD::FMINNUM, VT, Legal); + setOperationAction(ISD::FMAXNUM, VT, Legal); + setOperationAction(ISD::FROUND, VT, Legal); + + // No native support for these. + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FSQRT, VT, Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); + setOperationAction(ISD::FNEARBYINT, VT, Expand); + } + } + + // We 'support' these types up to bitcast/load/store level, regardless of + // MVE integer-only / float support. Only doing FP data processing on the FP + // vector types is inhibited at integer-only level. + const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 }; + for (auto VT : LongTypes) { + addRegisterClass(VT, &ARM::QPRRegClass); + setAllExpand(VT); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + } + // We can do bitwise operations on v2i64 vectors + setOperationAction(ISD::AND, MVT::v2i64, Legal); + setOperationAction(ISD::OR, MVT::v2i64, Legal); + setOperationAction(ISD::XOR, MVT::v2i64, Legal); + + // It is legal to extload from v4i8 to v4i16 or v4i32. + addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal); + addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal); + + // Some truncating stores are legal too. + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); +} + ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -240,7 +356,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. 
- if (Subtarget->isThumb() && Subtarget->hasVFP2() && + if (Subtarget->isThumb() && Subtarget->hasVFP2Base() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { static const struct { const RTLIB::Libcall Op; @@ -509,10 +625,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, else addRegisterClass(MVT::i32, &ARM::GPRRegClass); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && - !Subtarget->isThumb1Only()) { + if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() && + Subtarget->hasFPRegs()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); + if (!Subtarget->hasVFP2Base()) + setAllExpand(MVT::f32); + if (!Subtarget->hasFP64()) + setAllExpand(MVT::f64); } if (Subtarget->hasFullFP16()) { @@ -528,9 +648,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); - setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); - setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + addAllExtLoads(VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); @@ -547,6 +665,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); + if (Subtarget->hasMVEIntegerOps()) + addMVEVectorTypes(Subtarget->hasMVEFloatOps()); + + // Combine low-overhead loop intrinsics so that we can lower i1 types. + if (Subtarget->hasLOB()) + setTargetDAGCombine(ISD::BRCOND); + if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); @@ -565,11 +690,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } + } + if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { // v2f64 is legal so that QR subregs can be extracted as f64 elements, but - // neither Neon nor VFP support any arithmetic operations on it. - // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively - // supported for v4f32. + // none of Neon, MVE or VFP supports any arithmetic operations on it. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); @@ -603,7 +728,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); + } + if (Subtarget->hasNEON()) { + // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively + // supported for v4f32. setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); @@ -697,7 +826,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); // NEON only has FMA instructions as of VFP4. 
- if (!Subtarget->hasVFP4()) { + if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } @@ -711,9 +840,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::BUILD_VECTOR); - setTargetDAGCombine(ISD::VECTOR_SHUFFLE); - setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); @@ -731,7 +857,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - if (Subtarget->isFPOnlySP()) { + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { + setTargetDAGCombine(ISD::BUILD_VECTOR); + setTargetDAGCombine(ISD::VECTOR_SHUFFLE); + setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + } + + if (!Subtarget->hasFP64()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, @@ -767,9 +899,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + } + + if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()){ setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); } + if (!Subtarget->hasFP16()) + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + + if (!Subtarget->hasFP64()) + setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); + computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. @@ -832,6 +974,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + // MVE lowers 64 bit shifts to lsll and lsrl + // assuming that ISD::SRL and SRA of i64 are already marked custom + if (Subtarget->hasMVEIntegerOps()) + setOperationAction(ISD::SHL, MVT::i64, Custom); + // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. if (Subtarget->isThumb1Only()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); @@ -1029,7 +1176,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. 
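// --- Editor's note, not part of the patch above or below: the new
// setAllExpand()/addMVEVectorTypes() helpers earlier in this diff follow a
// "default every operation to Expand, then re-mark the few supported ones"
// pattern. A minimal standalone sketch of that pattern, using made-up Op and
// Action enums that stand in for ISD opcodes and TargetLoweringBase actions:
#include <array>
#include <cstdio>

enum Op { OP_LOAD, OP_STORE, OP_ADD, OP_UDIV, OP_COUNT };
enum Action { Legal, Expand, Custom };

struct TypeActions {
  std::array<Action, OP_COUNT> Actions{};
  void setAllExpand() { Actions.fill(Expand); }   // blanket default
  void set(Op O, Action A) { Actions[O] = A; }    // selective override
};

int main() {
  TypeActions V2i64;            // stands in for MVT::v2i64 under MVE
  V2i64.setAllExpand();         // no native arithmetic on this type
  V2i64.set(OP_LOAD, Legal);    // but plain loads/stores stay legal
  V2i64.set(OP_STORE, Legal);
  std::printf("UDIV action: %s\n",
              V2i64.Actions[OP_UDIV] == Expand ? "Expand" : "other");
  return 0;
}
// --- End editor's note; the diff continues below.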
@@ -1079,7 +1226,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && + if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); @@ -1087,7 +1234,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); - if (!Subtarget->hasVFP4()) { + if (!Subtarget->hasVFP4Base()) { setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); } @@ -1095,7 +1242,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Various VFP goodness if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. - if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { + if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); } @@ -1115,7 +1262,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // FP-ARMv8 implements a lot of rounding-like FP operations. - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8Base()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); @@ -1124,12 +1271,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); - setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); - setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (Subtarget->hasNEON()) { + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + } - if (!Subtarget->isFPOnlySP()) { + if (Subtarget->hasFP64()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); @@ -1141,6 +1290,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } + // FP16 often need to be promoted to call lib functions + if (Subtarget->hasFullFP16()) { + setOperationAction(ISD::FREM, MVT::f16, Promote); + setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand); + setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Promote); + setOperationAction(ISD::FSINCOS, MVT::f16, Promote); + setOperationAction(ISD::FPOWI, MVT::f16, Promote); + setOperationAction(ISD::FPOW, MVT::f16, Promote); + setOperationAction(ISD::FEXP, MVT::f16, Promote); + setOperationAction(ISD::FEXP2, MVT::f16, Promote); + setOperationAction(ISD::FLOG, MVT::f16, Promote); + setOperationAction(ISD::FLOG10, MVT::f16, Promote); + setOperationAction(ISD::FLOG2, MVT::f16, Promote); + + setOperationAction(ISD::FROUND, MVT::f16, Legal); + } + if (Subtarget->hasNEON()) { // vmin and vmax aren't available 
in a scalar form, so we use // a NEON instruction with an undef lane instead. @@ -1177,11 +1344,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); + if (Subtarget->isThumb1Only()) + setTargetDAGCombine(ISD::SHL); setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || - !Subtarget->hasVFP2()) + !Subtarget->hasVFP2Base() || Subtarget->hasMinSize()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); @@ -1204,6 +1373,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setPrefLoopAlignment(Subtarget->getPrefLoopAlignment()); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); + + if (Subtarget->isThumb() || Subtarget->isThumb2()) + setTargetDAGCombine(ISD::ABS); } bool ARMTargetLowering::useSoftFloat() const { @@ -1288,6 +1460,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; + case ARMISD::ASRL: return "ARMISD::ASRL"; + case ARMISD::LSRL: return "ARMISD::LSRL"; + case ARMISD::LSLL: return "ARMISD::LSLL"; + case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1332,23 +1508,25 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; - case ARMISD::VSHL: return "ARMISD::VSHL"; - case ARMISD::VSHRs: return "ARMISD::VSHRs"; - case ARMISD::VSHRu: return "ARMISD::VSHRu"; - case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; - case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; - case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; - case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; - case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; - case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; - case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; - case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; - case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; - case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; - case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; - case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; - case ARMISD::VSLI: return "ARMISD::VSLI"; - case ARMISD::VSRI: return "ARMISD::VSRI"; + case ARMISD::VSHLs: return "ARMISD::VSHLs"; + case ARMISD::VSHLu: return "ARMISD::VSHLu"; + case ARMISD::VSHLIMM: return "ARMISD::VSHLIMM"; + case ARMISD::VSHRsIMM: return "ARMISD::VSHRsIMM"; + case ARMISD::VSHRuIMM: return "ARMISD::VSHRuIMM"; + case ARMISD::VRSHRsIMM: return "ARMISD::VRSHRsIMM"; + case ARMISD::VRSHRuIMM: return "ARMISD::VRSHRuIMM"; + case ARMISD::VRSHRNIMM: return "ARMISD::VRSHRNIMM"; + case ARMISD::VQSHLsIMM: return "ARMISD::VQSHLsIMM"; + case ARMISD::VQSHLuIMM: return "ARMISD::VQSHLuIMM"; + case ARMISD::VQSHLsuIMM: return "ARMISD::VQSHLsuIMM"; + case ARMISD::VQSHRNsIMM: return "ARMISD::VQSHRNsIMM"; + case ARMISD::VQSHRNuIMM: return "ARMISD::VQSHRNuIMM"; + case ARMISD::VQSHRNsuIMM: return "ARMISD::VQSHRNsuIMM"; + case ARMISD::VQRSHRNsIMM: return "ARMISD::VQRSHRNsIMM"; + case ARMISD::VQRSHRNuIMM: return "ARMISD::VQRSHRNuIMM"; + case ARMISD::VQRSHRNsuIMM: return "ARMISD::VQRSHRNsuIMM"; + case ARMISD::VSLIIMM: return "ARMISD::VSLIIMM"; + case ARMISD::VSRIIMM: return "ARMISD::VSRIIMM"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; @@ -1410,6 +1588,7 
@@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; + case ARMISD::WLS: return "ARMISD::WLS"; } return nullptr; } @@ -1423,11 +1602,14 @@ EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, /// getRegClassFor - Return the register class that should be used for the /// specified value type. -const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { +const TargetRegisterClass * +ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { + (void)isDivergent; // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to - // load / store 4 to 8 consecutive D registers. - if (Subtarget->hasNEON()) { + // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive + // MVE Q registers. + if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) @@ -1590,8 +1772,6 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, // Calling Convention Implementation //===----------------------------------------------------------------------===// -#include "ARMGenCallingConv.inc" - /// getEffectiveCallingConv - Get the effective calling convention, taking into /// account presence of floating point hardware and calling convention /// limitations, such as support for variadic functions. @@ -1613,7 +1793,7 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; - else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && + else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; @@ -1622,10 +1802,11 @@ ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, case CallingConv::Fast: case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { - if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; - } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) + } else if (Subtarget->hasVFP2Base() && + !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; @@ -1807,29 +1988,42 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); - bool isThisReturn = false; - bool isSibCall = false; + bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); + bool isThisReturn = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); + bool PreferIndirect = false; // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; + if (isa<GlobalAddressSDNode>(Callee)) { + // If we're optimizing for minimum size and the function is called three or + // more times in this block, we can improve codesize by calling indirectly + // as BLXr has a 16-bit encoding. 
+ auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); + if (CLI.CS) { + auto *BB = CLI.CS.getParent(); + PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() && + count_if(GV->users(), [&BB](const User *U) { + return isa<Instruction>(U) && + cast<Instruction>(U)->getParent() == BB; + }) > 2; + } + } if (isTailCall) { // Check if it's really possible to do a tail call. - isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, - isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), - Outs, OutVals, Ins, DAG); + isTailCall = IsEligibleForTailCallOptimization( + Callee, CallConv, isVarArg, isStructRet, + MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG, + PreferIndirect); if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. - if (isTailCall) { + if (isTailCall) ++NumTailCalls; - isSibCall = true; - } } // Analyze operands of the call, assigning locations to each operand. @@ -1841,14 +2035,14 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // For tail calls, memory operands are available in our caller's stack. - if (isSibCall) + if (isTailCall) { + // For tail calls, memory operands are available in our caller's stack. NumBytes = 0; - - // Adjust the stack pointer for the new arguments... - // These operations are automatically eliminated by the prolog/epilog pass - if (!isSibCall) + } else { + // Adjust the stack pointer for the new arguments... + // These operations are automatically eliminated by the prolog/epilog pass Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); + } SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -1970,7 +2164,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } - } else if (!isSibCall) { + } else if (!isTailCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, @@ -1984,32 +2178,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; - // Tail call byval lowering might overwrite argument registers so in case of - // tail call optimization the copies to registers are lowered later. - if (!isTailCall) - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - - // For tail calls lower the arguments to the 'real' stack slot. - if (isTailCall) { - // Force all the incoming stack arguments to be loaded from the stack - // before any new outgoing arguments are stored to the stack, because the - // outgoing stack slots may alias the incoming argument stack slots, and - // the alias isn't otherwise explicit. This is slightly more conservative - // than necessary, because it means that each store effectively depends - // on every argument instead of just those arguments it would clobber. - - // Do not flag preceding copytoreg stuff together with the following stuff. 
- InFlag = SDValue(); - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, - RegsToPass[i].second, InFlag); - InFlag = Chain.getValue(1); - } - InFlag = SDValue(); + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, + RegsToPass[i].second, InFlag); + InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every @@ -2064,17 +2236,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa<GlobalAddressSDNode>(Callee)) { - // If we're optimizing for minimum size and the function is called three or - // more times in this block, we can improve codesize by calling indirectly - // as BLXr has a 16-bit encoding. - auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal(); - auto *BB = CLI.CS.getParent(); - bool PreferIndirect = - Subtarget->isThumb() && MF.getFunction().optForMinSize() && - count_if(GV->users(), [&BB](const User *U) { - return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB; - }) > 2; - if (!PreferIndirect) { isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); @@ -2098,7 +2259,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; - Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, + Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) Callee = @@ -2142,7 +2303,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && // Emit regular call when code size is the priority - !MF.getFunction().optForMinSize()) + !Subtarget->hasMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -2306,28 +2467,25 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. -bool -ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG& DAG) const { +bool ARMTargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG, + const bool isIndirect) const { MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); assert(Subtarget->supportsTailCall()); - // Tail calls to function pointers cannot be optimized for Thumb1 if the args + // Indirect tail calls cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. 
if (Subtarget->isThumb1Only() && Outs.size() >= 4 && - !isa<GlobalAddressSDNode>(Callee.getNode())) - return false; + (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) + return false; // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -2756,7 +2914,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto M = const_cast<Module*>(DAG.getMachineFunction(). getFunction().getParent()); auto GV = new GlobalVariable( - *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, + *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + Twine(AFI->createPICLabelUId()) @@ -3225,7 +3383,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else if (Subtarget->isRWPI() && !IsRO) { // SB-relative. SDValue RelAddr; - if (Subtarget->useMovt(DAG.getMachineFunction())) { + if (Subtarget->useMovt()) { ++NumMovwMovt; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); @@ -3245,7 +3403,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. - if (Subtarget->useMovt(DAG.getMachineFunction())) { + if (Subtarget->useMovt()) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. @@ -3268,7 +3426,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - if (Subtarget->useMovt(DAG.getMachineFunction())) + if (Subtarget->useMovt()) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register @@ -3288,7 +3446,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); - assert(Subtarget->useMovt(DAG.getMachineFunction()) && + assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt"); assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Windows"); @@ -3309,7 +3467,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, - DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, + DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0, TargetFlags)); if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, @@ -3615,7 +3773,8 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, // argument passed via stack. int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(), - CCInfo.getNextStackOffset(), 4); + CCInfo.getNextStackOffset(), + std::max(4U, TotalArgRegsSaveSize)); AFI->setVarArgsFrameIndex(FrameIndex); } @@ -3891,6 +4050,22 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } ARMCC::CondCodes CondCode = IntCCToARMCC(CC); + + // If the RHS is a constant zero then the V (overflow) flag will never be + // set. 
This can allow us to simplify GE to PL or LT to MI, which can be + // simpler for other passes (like the peephole optimiser) to deal with. + if (isNullConstant(RHS)) { + switch (CondCode) { + default: break; + case ARMCC::GE: + CondCode = ARMCC::PL; + break; + case ARMCC::LT: + CondCode = ARMCC::MI; + break; + } + } + ARMISD::NodeType CompareType; switch (CondCode) { default: @@ -3910,7 +4085,7 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, bool InvalidOnQNaN) const { - assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); + assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64); SDValue Cmp; SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) @@ -4175,18 +4350,18 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, // Start by selecting the GE condition code for opcodes that return true for // 'equality' if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || - CC == ISD::SETULE) + CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE) CondCode = ARMCC::GE; // and GT for opcodes that return false for 'equality'. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || - CC == ISD::SETULT) + CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT) CondCode = ARMCC::GT; // Since we are constrained to GE/GT, if the opcode contains 'less', we need // to swap the compare operands. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || - CC == ISD::SETULT) + CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT) swpCmpOps = true; // Both GT and GE are ordered comparisons, and return false for 'unordered'. @@ -4212,8 +4387,9 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, } // 'unordered or not equal' is 'anything but equal', so use the EQ condition - // code and swap the VSEL operands. - if (CC == ISD::SETUNE) { + // code and swap the VSEL operands. Also do this if we don't care about the + // unordered case. 
+ if (CC == ISD::SETUNE || CC == ISD::SETNE) { CondCode = ARMCC::EQ; swpVselOps = true; } @@ -4222,7 +4398,7 @@ static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { - if (Subtarget->isFPOnlySP() && VT == MVT::f64) { + if (!Subtarget->hasFP64() && VT == MVT::f64) { FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal); TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, @@ -4428,6 +4604,16 @@ static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, return false; } +bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const { + if (VT == MVT::f32) + return !Subtarget->hasVFP2Base(); + if (VT == MVT::f64) + return !Subtarget->hasFP64(); + if (VT == MVT::f16) + return !Subtarget->hasFullFP16(); + return false; +} + SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); @@ -4471,9 +4657,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); - if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { - DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, - dl); + if (isUnsupportedFloatingType(LHS.getValueType())) { + DAG.getTargetLoweringInfo().softenSetCCOperands( + DAG, LHS.getValueType(), LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4494,8 +4680,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) - if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 || + TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { @@ -4507,6 +4694,9 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); + // Choose GE over PL, which vsel does now support + if (cast<ConstantSDNode>(ARMcc)->getZExtValue() == ARMCC::PL) + ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32); return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); } @@ -4514,12 +4704,15 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { bool InvalidOnQNaN; FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); - // Normalize the fp compare. If RHS is zero we keep it there so we match - // CMPFPw0 instead of CMPFP. - if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) && - (TrueVal.getValueType() == MVT::f16 || - TrueVal.getValueType() == MVT::f32 || - TrueVal.getValueType() == MVT::f64)) { + // Normalize the fp compare. 
If RHS is zero we prefer to keep it there so we + // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we + // must use VSEL (limited condition codes), due to not having conditional f16 + // moves. + if (Subtarget->hasFPARMv8Base() && + !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) && + (TrueVal.getValueType() == MVT::f16 || + TrueVal.getValueType() == MVT::f32 || + TrueVal.getValueType() == MVT::f64)) { bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -4708,9 +4901,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Dest = Op.getOperand(4); SDLoc dl(Op); - if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { - DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, - dl); + if (isUnsupportedFloatingType(LHS.getValueType())) { + DAG.getTargetLoweringInfo().softenSetCCOperands( + DAG, LHS.getValueType(), LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. @@ -4855,7 +5048,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { + if (isUnsupportedFloatingType(Op.getOperand(0).getValueType())) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), @@ -4919,7 +5112,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); - if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { + if (isUnsupportedFloatingType(VT)) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), @@ -4952,7 +5145,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) - Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ @@ -4960,11 +5153,11 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) - Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, + Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) - Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, + Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); @@ -5469,40 +5662,100 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, return Res; } +/// Getvshiftimm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift operation, where all the elements of the +/// build_vector must have the same constant integer value. +static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { + // Ignore bit_converts. 
+ while (Op.getOpcode() == ISD::BITCAST) + Op = Op.getOperand(0); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); + APInt SplatBits, SplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (!BVN || + !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, + ElementBits) || + SplatBitSize > ElementBits) + return false; + Cnt = SplatBits.getSExtValue(); + return true; +} + +/// isVShiftLImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift left operation. That value must be in the range: +/// 0 <= Value < ElementBits for a left shift; or +/// 0 <= Value <= ElementBits for a long left shift. +static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); +} + +/// isVShiftRImm - Check if this is a valid build_vector for the immediate +/// operand of a vector shift right operation. For a shift opcode, the value +/// is positive, but for an intrinsic the value count must be negative. The +/// absolute value must be in the range: +/// 1 <= |Value| <= ElementBits for a right shift; or +/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, + int64_t &Cnt) { + assert(VT.isVector() && "vector shift count is not a vector type"); + int64_t ElementBits = VT.getScalarSizeInBits(); + if (!getVShiftImm(Op, ElementBits, Cnt)) + return false; + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) { + Cnt = -Cnt; + return true; + } + return false; +} + static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); + int64_t Cnt; if (!VT.isVector()) return SDValue(); - // Lower vector shifts on NEON to use VSHL. - assert(ST->hasNEON() && "unexpected vector shift"); + // We essentially have two forms here. Shift by an immediate and shift by a + // vector register (there are also shift by a gpr, but that is just handled + // with a tablegen pattern). We cannot easily match shift by an immediate in + // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM. + // For shifting by a vector, we don't have VSHR, only VSHL (which can be + // signed or unsigned, and a negative shift indicates a shift right). + if (N->getOpcode() == ISD::SHL) { + if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) + return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0), + N->getOperand(1)); + } - // Left shifts translate directly to the vshiftu intrinsic. - if (N->getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, - MVT::i32), - N->getOperand(0), N->getOperand(1)); + assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && + "unexpected vector shift opcode"); - assert((N->getOpcode() == ISD::SRA || - N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); + if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), + DAG.getConstant(Cnt, dl, MVT::i32)); + } - // NEON uses the same intrinsics for both left and right shifts. For - // right shifts, the shift amounts are negative, so negate the vector of - // shift amounts. + // Other right shifts we don't have operations for (we use a shift left by a + // negative number). EVT ShiftVT = N->getOperand(1).getValueType(); - SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, - getZeroVector(ShiftVT, DAG, dl), - N->getOperand(1)); - Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? - Intrinsic::arm_neon_vshifts : - Intrinsic::arm_neon_vshiftu); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(vshiftInt, dl, MVT::i32), - N->getOperand(0), NegatedCount); + SDValue NegatedCount = DAG.getNode( + ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu); + return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, @@ -5514,15 +5767,59 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, if (VT != MVT::i64) return SDValue(); - assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && + assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA || + N->getOpcode() == ISD::SHL) && "Unknown shift to lower!"); + unsigned ShOpc = N->getOpcode(); + if (ST->hasMVEIntegerOps()) { + SDValue ShAmt = N->getOperand(1); + unsigned ShPartsOpc = ARMISD::LSLL; + ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt); + + // If the shift amount is greater than 32 then do the default optimisation + if (Con && Con->getZExtValue() > 32) + return SDValue(); + + // Extract the lower 32 bits of the shift amount if it's an i64 + if (ShAmt->getValueType(0) == MVT::i64) + ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt, + DAG.getConstant(0, dl, MVT::i32)); + + if (ShOpc == ISD::SRL) { + if (!Con) + // There is no t2LSRLr instruction so negate and perform an lsll if the + // shift amount is in a register, emulating a right shift. + ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, + DAG.getConstant(0, dl, MVT::i32), ShAmt); + else + // Else generate an lsrl on the immediate shift amount + ShPartsOpc = ARMISD::LSRL; + } else if (ShOpc == ISD::SRA) + ShPartsOpc = ARMISD::ASRL; + + // Lower 32 bits of the destination/source + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(0, dl, MVT::i32)); + // Upper 32 bits of the destination/source + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), + DAG.getConstant(1, dl, MVT::i32)); + + // Generate the shift operation as computed above + Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi, + ShAmt); + // The upper 32 bits come from the second return value of lsll + Hi = SDValue(Lo.getNode(), 1); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); + } + // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isOneConstant(N->getOperand(1))) + if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL) return SDValue(); // If we are in thumb mode, we don't have RRX. - if (ST->isThumb1Only()) return SDValue(); + if (ST->isThumb1Only()) + return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. 
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), @@ -5731,7 +6028,7 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { } /// isNEONModifiedImm - Check if the specified splat value corresponds to a -/// valid vector constant for a NEON instruction with a "modified immediate" +/// valid vector constant for a NEON or MVE instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, @@ -5817,6 +6114,10 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, break; } + // cmode == 0b1101 is not supported for MVE VMVN + if (type == MVEVMVNModImm) + return SDValue(); + if ((SplatBits & ~0xffffff) == 0 && ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { // Value = 0x00nnffff: Op=x, Cmode=1101. @@ -5902,12 +6203,12 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, } } - if (!ST->hasVFP3()) + if (!ST->hasVFP3Base()) return SDValue(); // Use the default (constant pool) lowering for double constants when we have // an SP-only FPU - if (IsDouble && Subtarget->isFPOnlySP()) + if (IsDouble && !Subtarget->hasFP64()) return SDValue(); // Try splatting with a VMOV.f32... @@ -6383,13 +6684,15 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (SplatUndef.isAllOnesValue()) return DAG.getUNDEF(VT); - if (SplatBitSize <= 64) { + if ((ST->hasNEON() && SplatBitSize <= 64) || + (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) { // Check if an immediate VMOV works. EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); + if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6397,10 +6700,10 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); - Val = isNEONModifiedImm(NegatedImm, - SplatUndef.getZExtValue(), SplatBitSize, - DAG, dl, VmovVT, VT.is128BitVector(), - VMVNModImm); + Val = isNEONModifiedImm( + NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, + DAG, dl, VmovVT, VT.is128BitVector(), + ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6515,10 +6818,13 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector<SDValue, 8> Ops; + MVT FVT = VT.getVectorElementType().getSimpleVT(); + assert(FVT == MVT::f32 || FVT == MVT::f16); + MVT IVT = (FVT == MVT::f32) ? 
MVT::i32 : MVT::i16; for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) @@ -6544,7 +6850,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return shuffle; } - if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { + if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector // into two 64-bit vectors; we might discover a better way to lower it. SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts); @@ -6799,6 +7105,38 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } +enum ShuffleOpCodes { + OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> + OP_VREV, + OP_VDUP0, + OP_VDUP1, + OP_VDUP2, + OP_VDUP3, + OP_VEXT1, + OP_VEXT2, + OP_VEXT3, + OP_VUZPL, // VUZP, left result + OP_VUZPR, // VUZP, right result + OP_VZIPL, // VZIP, left result + OP_VZIPR, // VZIP, right result + OP_VTRNL, // VTRN, left result + OP_VTRNR // VTRN, right result +}; + +static bool isLegalMVEShuffleOp(unsigned PFEntry) { + unsigned OpNum = (PFEntry >> 26) & 0x0F; + switch (OpNum) { + case OP_COPY: + case OP_VREV: + case OP_VDUP0: + case OP_VDUP1: + case OP_VDUP2: + case OP_VDUP3: + return true; + } + return false; +} + /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. 
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -6820,7 +7158,7 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) + if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry))) return true; } @@ -6828,15 +7166,22 @@ bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { unsigned Imm, WhichResult; unsigned EltSize = VT.getScalarSizeInBits(); - return (EltSize >= 32 || - ShuffleVectorSDNode::isSplatMask(&M[0], VT) || - isVREVMask(M, VT, 64) || - isVREVMask(M, VT, 32) || - isVREVMask(M, VT, 16) || - isVEXTMask(M, VT, ReverseVEXT, Imm) || - isVTBLMask(M, VT) || - isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || - ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); + if (EltSize >= 32 || + ShuffleVectorSDNode::isSplatMask(&M[0], VT) || + isVREVMask(M, VT, 64) || + isVREVMask(M, VT, 32) || + isVREVMask(M, VT, 16)) + return true; + else if (Subtarget->hasNEON() && + (isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTBLMask(M, VT) || + isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))) + return true; + else if (Subtarget->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && + isReverseMask(M, VT)) + return true; + else + return false; } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit @@ -6848,24 +7193,6 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); - enum { - OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> - OP_VREV, - OP_VDUP0, - OP_VDUP1, - OP_VDUP2, - OP_VDUP3, - OP_VEXT1, - OP_VEXT2, - OP_VEXT3, - OP_VUZPL, // VUZP, left result - OP_VUZPR, // VUZP, right result - OP_VZIPL, // VZIP, left result - OP_VZIPR, // VZIP, right result - OP_VTRNL, // VTRN, left result - OP_VTRNR // VTRN, right result - }; - if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); @@ -6955,7 +7282,8 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, DAG.getConstant(ExtractNum, DL, MVT::i32)); } -static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, + const ARMSubtarget *ST) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); @@ -6999,9 +7327,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { DAG.getConstant(Lane, dl, MVT::i32)); } - bool ReverseVEXT; - unsigned Imm; - if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { + bool ReverseVEXT = false; + unsigned Imm = 0; + if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, @@ -7015,7 +7343,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); - if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { + if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } @@ -7025,14 +7353,16 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // source operands and with masks corresponding to both results of one of // these 
operations, DAG memoization will ensure that a single node is // used for both shuffles. - unsigned WhichResult; - bool isV_UNDEF; - if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( - ShuffleMask, VT, WhichResult, isV_UNDEF)) { - if (isV_UNDEF) - V2 = V1; - return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) - .getValue(WhichResult); + unsigned WhichResult = 0; + bool isV_UNDEF = false; + if (ST->hasNEON()) { + if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( + ShuffleMask, VT, WhichResult, isV_UNDEF)) { + if (isV_UNDEF) + V2 = V1; + return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) + .getValue(WhichResult); + } } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize @@ -7050,7 +7380,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { // -> // concat(VZIP(v1, v2):0, :1) // - if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { + if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); @@ -7092,8 +7422,18 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); - if (Cost <= 4) - return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + if (Cost <= 4) { + if (ST->hasNEON()) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + else if (isLegalMVEShuffleOp(PFEntry)) { + unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); + unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); + unsigned PFEntryLHS = PerfectShuffleTable[LHSID]; + unsigned PFEntryRHS = PerfectShuffleTable[RHSID]; + if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS)) + return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); + } + } } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. @@ -7118,22 +7458,50 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::BITCAST, dl, VT, Val); } - if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) + if (ST->hasNEON() && (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); - if (VT == MVT::v8i8) + if (ST->hasNEON() && VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; return SDValue(); } -static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { +SDValue ARMTargetLowering:: +LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa<ConstantSDNode>(Lane)) return SDValue(); + SDValue Elt = Op.getOperand(1); + EVT EltVT = Elt.getValueType(); + if (getTypeAction(*DAG.getContext(), EltVT) == + TargetLowering::TypePromoteFloat) { + // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32, + // but the type system will try to do that if we don't intervene. + // Reinterpret any such vector-element insertion as one with the + // corresponding integer types. 
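
// Editorial aside: a standalone illustration, not part of the patch, of the
// trick used in LowerINSERT_VECTOR_ELT above: inserting a floating-point
// element into a vector is a pure bit move, so it can be done entirely in the
// corresponding integer types and bitcast back.  Standard C++ has no half
// type, so this sketch uses f32/i32 lanes; the patch applies the same idea to
// f16/i16 to avoid the unwanted promotion to f32.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

using FVec4 = std::array<float, 4>;
using IVec4 = std::array<uint32_t, 4>;

static FVec4 insertDirect(FVec4 V, float Elt, unsigned Lane) {
  V[Lane] = Elt;
  return V;
}

static FVec4 insertViaIntegers(FVec4 V, float Elt, unsigned Lane) {
  IVec4 IV;                       // bitcast the vector to integer lanes
  uint32_t IElt;                  // bitcast the element to an integer
  std::memcpy(&IV, &V, sizeof IV);
  std::memcpy(&IElt, &Elt, sizeof IElt);
  IV[Lane] = IElt;                // the insert itself is just a lane store
  std::memcpy(&V, &IV, sizeof V); // bitcast back to the FP vector type
  return V;
}

int main() {
  FVec4 V = {1.0f, 2.0f, 3.0f, 4.0f};
  float E = 0.5f;
  FVec4 A = insertDirect(V, E, 2);
  FVec4 B = insertViaIntegers(V, E, 2);
  assert(std::memcmp(&A, &B, sizeof A) == 0); // bit-identical results
  return 0;
}
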
+ + SDLoc dl(Op); + + EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits()); + assert(getTypeAction(*DAG.getContext(), IEltVT) != + TargetLowering::TypePromoteFloat); + + SDValue VecIn = Op.getOperand(0); + EVT VecVT = VecIn.getValueType(); + EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT, + VecVT.getVectorNumElements()); + + SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt); + SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn); + SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT, + IVecIn, IElt, Lane); + return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut); + } + return Op; } @@ -7809,8 +8177,7 @@ ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return SDValue(); const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget()); - const auto &MF = DAG.getMachineFunction(); - const bool MinSize = MF.getFunction().optForMinSize(); + const bool MinSize = ST.hasMinSize(); const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode() : ST.hasDivideInARMMode(); @@ -8063,7 +8430,7 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); - case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); + case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); @@ -8149,6 +8516,7 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, break; case ISD::SRL: case ISD::SRA: + case ISD::SHL: Res = Expand64BitShift(N, DAG, Subtarget); break; case ISD::SREM: @@ -8175,6 +8543,10 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); + case ISD::ABS: + lowerABS(N, Results, DAG); + return ; + } if (Res.getNode()) Results.push_back(Res); @@ -8980,7 +9352,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, // Load an immediate to varEnd. 
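
// Editorial aside: a standalone sketch, not part of the patch, of the classic
// way a 64-bit left shift is decomposed into 32-bit halves when i64 is not
// legal, which is the kind of expansion the SRL/SRA/SHL cases routed to
// Expand64BitShift above have to perform.  Only the constant-amount case is
// shown; the exact node sequence the backend emits is not reproduced here.
#include <cassert>
#include <cstdint>

struct U64Parts { uint32_t Lo, Hi; };

static U64Parts shl64(U64Parts X, unsigned Amt) {
  assert(Amt < 64);
  if (Amt == 0)
    return X;
  if (Amt < 32)
    return { X.Lo << Amt, (X.Hi << Amt) | (X.Lo >> (32 - Amt)) };
  return { 0, X.Lo << (Amt - 32) };  // the low half shifts entirely into Hi
}

int main() {
  uint64_t V = 0x0123456789ABCDEFull;
  for (unsigned Amt = 0; Amt < 64; ++Amt) {
    U64Parts P = shl64({uint32_t(V), uint32_t(V >> 32)}, Amt);
    uint64_t Ref = V << Amt;
    assert(P.Lo == uint32_t(Ref) && P.Hi == uint32_t(Ref >> 32));
  }
  return 0;
}
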
unsigned varEnd = MRI.createVirtualRegister(TRC); - if (Subtarget->useMovt(*MF)) { + if (Subtarget->useMovt()) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); @@ -9003,18 +9375,23 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI, if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); + MachineMemOperand *CPMMO = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .addMemOperand(CPMMO); else BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0) - .add(predOps(ARMCC::AL)); + .add(predOps(ARMCC::AL)) + .addMemOperand(CPMMO); } BB->addSuccessor(loopMBB); @@ -9262,7 +9639,8 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .add(MI.getOperand(2)) // Rn .add(MI.getOperand(3)) // PredImm .add(MI.getOperand(4)) // PredReg - .add(MI.getOperand(0)); // Rt + .add(MI.getOperand(0)) // Rt + .cloneMemRefs(MI); MI.eraseFromParent(); return BB; } @@ -10372,6 +10750,22 @@ static SDValue PerformAddeSubeCombine(SDNode *N, return SDValue(); } +static SDValue PerformABSCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + SDValue res; + SelectionDAG &DAG = DCI.DAG; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + if (TLI.isOperationLegal(N->getOpcode(), N->getValueType(0))) + return SDValue(); + + if (!TLI.expandABS(N, res, DAG)) + return SDValue(); + + return res; +} + /// PerformADDECombine - Target-specific dag combine transform from /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL @@ -10419,11 +10813,28 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, if (Level == BeforeLegalizeTypes) return true; - if (Subtarget->isThumb() && Subtarget->isThumb1Only()) + if (N->getOpcode() != ISD::SHL) return true; - if (N->getOpcode() != ISD::SHL) + if (Subtarget->isThumb1Only()) { + // Avoid making expensive immediates by commuting shifts. (This logic + // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted + // for free.) + if (N->getOpcode() != ISD::SHL) + return true; + SDValue N1 = N->getOperand(0); + if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND && + N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR) + return true; + if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) { + if (Const->getAPIntValue().ult(256)) + return false; + if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) && + Const->getAPIntValue().sgt(-256)) + return false; + } return true; + } // Turn off commute-with-shift transform after legalization, so it doesn't // conflict with PerformSHLSimplify. 
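
// Editorial aside: a standalone restatement, not part of the patch, of the
// Thumb1 heuristic in isDesirableToCommuteWithShift above.  Commuting
// (shl (op x, C), k) into (op (shl x, k), C << k) grows the constant, and
// Thumb1 immediates are small, so the hook refuses to commute when C is
// already cheap: an unsigned value below 256, or, for ADD only, a negative
// value above -256 (which can be emitted as a small SUB immediate).
#include <cassert>
#include <cstdint>

enum BinOp { Add, And, Or, Xor };

static bool desirableToCommuteShiftThumb1(BinOp Op, int64_t C) {
  if (uint64_t(C) < 256)                // cheap as-is: keep the shift outside
    return false;
  if (Op == Add && C < 0 && C > -256)   // cheap as a small SUB immediate
    return false;
  return true;                          // otherwise commuting may still help
}

int main() {
  assert(!desirableToCommuteShiftThumb1(And, 255));
  assert(!desirableToCommuteShiftThumb1(Add, -17));
  assert(desirableToCommuteShiftThumb1(Or, 0x1234));
  assert(desirableToCommuteShiftThumb1(Xor, -17)); // only ADD gets the
                                                   // negative-immediate break
  return 0;
}
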
(We could try to detect when @@ -10432,9 +10843,8 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, return false; } -bool -ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N, - CombineLevel Level) const { +bool ARMTargetLowering::shouldFoldConstantShiftPairToMask( + const SDNode *N, CombineLevel Level) const { if (!Subtarget->isThumb1Only()) return true; @@ -10444,6 +10854,15 @@ ARMTargetLowering::shouldFoldShiftPairToMask(const SDNode *N, return false; } +bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const { + if (!Subtarget->hasNEON()) { + if (Subtarget->isThumb1Only()) + return VT.getScalarSizeInBits() <= 32; + return true; + } + return VT.isScalarInteger(); +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { @@ -10830,7 +11249,7 @@ static SDValue PerformANDCombine(SDNode *N, APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && + if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; @@ -11308,7 +11727,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, const ARMSubtarget *Subtarget) { // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); - if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) + if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64()) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); // vmovrrd(load f64) -> (load i32), (load i32) @@ -11329,9 +11748,11 @@ static SDValue PerformVMOVRRDCombine(SDNode *N, SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); - SDValue NewLD2 = DAG.getLoad( - MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), - std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); + + SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr, + LD->getPointerInfo().getWithOffset(4), + std::min(4U, LD->getAlignment()), + LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); if (DCI.DAG.getDataLayout().isBigEndian()) @@ -11922,10 +12343,14 @@ static SDValue PerformVDUPLANECombine(SDNode *N, /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. static SDValue PerformVDUPCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); + // Match VDUP(LOAD) -> VLD1DUP. // We match this pattern here rather than waiting for isel because the // transform is only legal for unindexed loads. @@ -12132,11 +12557,11 @@ static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would - // be lossy. We also can't handle more then 4 lanes, since these intructions - // only support v2i32/v4i32 types. + // be lossy. 
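
// Editorial aside: a standalone note, not part of the patch, on the algebra
// behind the preferIncOfAddToSubOfNot hook added above, assuming the name
// refers to the usual equivalence.  In two's complement ~y == -y - 1, so
// x - ~y == x + y + 1: the same value can be built either as an increment of
// an add or as a subtract of a not, and the hook picks the form the target
// handles better (the increment form for scalars and for Thumb1, the
// sub-of-not form for NEON vectors).
#include <cassert>
#include <cstdint>

static uint32_t incOfAdd(uint32_t X, uint32_t Y) { return X + Y + 1; }
static uint32_t subOfNot(uint32_t X, uint32_t Y) { return X - ~Y; }

int main() {
  const uint32_t Samples[] = {0u, 1u, 41u, 0x7FFFFFFFu, 0x80000000u,
                              0xFFFFFFFFu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples)
      assert(incOfAdd(X, Y) == subOfNot(X, Y)); // identical modulo 2^32
  return 0;
}
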
We also can't handle anything other than 2 or 4 lanes, since + // these intructions only support v2i32/v4i32 types. return SDValue(); } @@ -12190,11 +12615,11 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. We also can't handle more then 4 lanes, since these intructions - // only support v2i32/v4i32 types. + // be lossy. We also can't handle anything other than 2 or 4 lanes, since + // these intructions only support v2i32/v4i32 types. return SDValue(); } @@ -12220,58 +12645,6 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, ConvInput, DAG.getConstant(C, dl, MVT::i32)); } -/// Getvshiftimm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift operation, where all the elements of the -/// build_vector must have the same constant integer value. -static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { - // Ignore bit_converts. - while (Op.getOpcode() == ISD::BITCAST) - Op = Op.getOperand(0); - BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); - APInt SplatBits, SplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; - if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, - HasAnyUndefs, ElementBits) || - SplatBitSize > ElementBits) - return false; - Cnt = SplatBits.getSExtValue(); - return true; -} - -/// isVShiftLImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift left operation. That value must be in the range: -/// 0 <= Value < ElementBits for a left shift; or -/// 0 <= Value <= ElementBits for a long left shift. -static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! getVShiftImm(Op, ElementBits, Cnt)) - return false; - return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); -} - -/// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { - assert(VT.isVector() && "vector shift count is not a vector type"); - int64_t ElementBits = VT.getScalarSizeInBits(); - if (! getVShiftImm(Op, ElementBits, Cnt)) - return false; - if (!isIntrinsic) - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); - if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { - Cnt = -Cnt; - return true; - } - return false; -} - /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. 
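
// Editorial aside: a standalone sketch, not part of the patch, of the
// shift-amount ranges documented for the NEON immediate-shift helpers removed
// above: a left shift takes 0 <= Cnt < ElementBits (or <= ElementBits for a
// widening "long" shift), and a right shift takes 1 <= Cnt <= ElementBits
// (or ElementBits/2 for a narrowing shift), with intrinsics encoding
// right-shift counts as negatives.  Names are local to this sketch.
#include <cassert>
#include <cstdint>

static bool isValidVShiftLImm(int64_t Cnt, unsigned ElementBits, bool isLong) {
  return Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < (int64_t)ElementBits;
}

static bool isValidVShiftRImm(int64_t Cnt, unsigned ElementBits, bool isNarrow,
                              bool isIntrinsic) {
  int64_t Max = isNarrow ? ElementBits / 2 : ElementBits;
  if (isIntrinsic)
    Cnt = -Cnt;                  // intrinsic operands carry the count negated
  return Cnt >= 1 && Cnt <= Max;
}

int main() {
  assert(isValidVShiftLImm(7, 8, /*isLong=*/false));
  assert(!isValidVShiftLImm(8, 8, /*isLong=*/false));
  assert(isValidVShiftLImm(8, 8, /*isLong=*/true));   // long shifts allow #8
  assert(isValidVShiftRImm(8, 8, false, /*isIntrinsic=*/false));
  assert(isValidVShiftRImm(-8, 8, false, /*isIntrinsic=*/true));
  assert(!isValidVShiftRImm(5, 8, /*isNarrow=*/true, false));
  return 0;
}
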
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); @@ -12307,12 +12680,12 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { - VShiftOpc = ARMISD::VSHL; + VShiftOpc = ARMISD::VSHLIMM; break; } if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { - VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? - ARMISD::VSHRs : ARMISD::VSHRu); + VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM + : ARMISD::VSHRuIMM); break; } return SDValue(); @@ -12357,29 +12730,41 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { // Opcode already set above. break; case Intrinsic::arm_neon_vrshifts: - VShiftOpc = ARMISD::VRSHRs; break; + VShiftOpc = ARMISD::VRSHRsIMM; + break; case Intrinsic::arm_neon_vrshiftu: - VShiftOpc = ARMISD::VRSHRu; break; + VShiftOpc = ARMISD::VRSHRuIMM; + break; case Intrinsic::arm_neon_vrshiftn: - VShiftOpc = ARMISD::VRSHRN; break; + VShiftOpc = ARMISD::VRSHRNIMM; + break; case Intrinsic::arm_neon_vqshifts: - VShiftOpc = ARMISD::VQSHLs; break; + VShiftOpc = ARMISD::VQSHLsIMM; + break; case Intrinsic::arm_neon_vqshiftu: - VShiftOpc = ARMISD::VQSHLu; break; + VShiftOpc = ARMISD::VQSHLuIMM; + break; case Intrinsic::arm_neon_vqshiftsu: - VShiftOpc = ARMISD::VQSHLsu; break; + VShiftOpc = ARMISD::VQSHLsuIMM; + break; case Intrinsic::arm_neon_vqshiftns: - VShiftOpc = ARMISD::VQSHRNs; break; + VShiftOpc = ARMISD::VQSHRNsIMM; + break; case Intrinsic::arm_neon_vqshiftnu: - VShiftOpc = ARMISD::VQSHRNu; break; + VShiftOpc = ARMISD::VQSHRNuIMM; + break; case Intrinsic::arm_neon_vqshiftnsu: - VShiftOpc = ARMISD::VQSHRNsu; break; + VShiftOpc = ARMISD::VQSHRNsuIMM; + break; case Intrinsic::arm_neon_vqrshiftns: - VShiftOpc = ARMISD::VQRSHRNs; break; + VShiftOpc = ARMISD::VQRSHRNsIMM; + break; case Intrinsic::arm_neon_vqrshiftnu: - VShiftOpc = ARMISD::VQRSHRNu; break; + VShiftOpc = ARMISD::VQRSHRNuIMM; + break; case Intrinsic::arm_neon_vqrshiftnsu: - VShiftOpc = ARMISD::VQRSHRNsu; break; + VShiftOpc = ARMISD::VQRSHRNsuIMM; + break; } SDLoc dl(N); @@ -12393,9 +12778,9 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned VShiftOpc = 0; if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) - VShiftOpc = ARMISD::VSLI; + VShiftOpc = ARMISD::VSLIIMM; else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) - VShiftOpc = ARMISD::VSRI; + VShiftOpc = ARMISD::VSRIIMM; else { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } @@ -12420,8 +12805,10 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { /// combining instead of DAG legalizing because the build_vectors for 64-bit /// vector element shift counts are generally not legal, and it is hard to see /// their values after they get legalized to loads from a constant pool. 
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, +static SDValue PerformShiftCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { + SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high @@ -12436,12 +12823,47 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, } } + if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 && + N->getOperand(0)->getOpcode() == ISD::AND && + N->getOperand(0)->hasOneUse()) { + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't + // usually show up because instcombine prefers to canonicalize it to + // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come + // out of GEP lowering in some cases. + SDValue N0 = N->getOperand(0); + ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!ShiftAmtNode) + return SDValue(); + uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue()); + ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!AndMaskNode) + return SDValue(); + uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue()); + // Don't transform uxtb/uxth. + if (AndMask == 255 || AndMask == 65535) + return SDValue(); + if (isMask_32(AndMask)) { + uint32_t MaskedBits = countLeadingZeros(AndMask); + if (MaskedBits > ShiftAmt) { + SDLoc DL(N); + SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), + DAG.getConstant(MaskedBits, DL, MVT::i32)); + return DAG.getNode( + ISD::SRL, DL, MVT::i32, SHL, + DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32)); + } + } + } + // Nothing to be done for scalar shifts. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); + if (ST->hasMVEIntegerOps() && VT == MVT::v2i64) + return SDValue(); - assert(ST->hasNEON() && "unexpected vector shift"); int64_t Cnt; switch (N->getOpcode()) { @@ -12450,7 +12872,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { SDLoc dl(N); - return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), + return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } break; @@ -12458,8 +12880,8 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, case ISD::SRA: case ISD::SRL: if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { - unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? - ARMISD::VSHRs : ARMISD::VSHRu); + unsigned VShiftOpc = + (N->getOpcode() == ISD::SRA ? 
ARMISD::VSHRsIMM : ARMISD::VSHRuIMM); SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); @@ -12606,6 +13028,45 @@ SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &D return V; } +static SDValue PerformHWLoopCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *ST) { + // Look for (brcond (xor test.set.loop.iterations, -1) + SDValue CC = N->getOperand(1); + unsigned Opc = CC->getOpcode(); + SDValue Int; + + if ((Opc == ISD::XOR || Opc == ISD::SETCC) && + (CC->getOperand(0)->getOpcode() == ISD::INTRINSIC_W_CHAIN)) { + + assert((isa<ConstantSDNode>(CC->getOperand(1)) && + cast<ConstantSDNode>(CC->getOperand(1))->isOne()) && + "Expected to compare against 1"); + + Int = CC->getOperand(0); + } else if (CC->getOpcode() == ISD::INTRINSIC_W_CHAIN) + Int = CC; + else + return SDValue(); + + unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue(); + if (IntOp != Intrinsic::test_set_loop_iterations) + return SDValue(); + + SDLoc dl(Int); + SDValue Chain = N->getOperand(0); + SDValue Elements = Int.getOperand(2); + SDValue ExitBlock = N->getOperand(2); + + // TODO: Once we start supporting tail predication, we can add another + // operand to WLS for the number of elements processed in a vector loop. + + SDValue Ops[] = { Chain, Elements, ExitBlock }; + SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops); + DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0)); + return Res; +} + /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { @@ -12779,15 +13240,21 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). // CMOV (SUBS x, y), z, !=, (SUBS x, y):1 -> - // merge t3, t4 - // where t1 = (SUBCARRY (SUB x, y), z, 0) - // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) - // t3 = if K != 0 then (SHL t2:0, K) else t2:0 - // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] + // t1 = (USUBO (SUB x, y), 1) + // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) + // Result = if K != 0 then (SHL t2:0, K) else t2:0 + // + // This also handles the special case of comparing against zero; it's + // essentially, the same pattern, except there's no SUBS: + // CMOV x, z, !=, (CMPZ x, 0) -> + // t1 = (USUBO x, 1) + // t2 = (SUBCARRY x, t1:0, t1:1) + // Result = if K != 0 then (SHL t2:0, K) else t2:0 const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && - (FalseVal.getOpcode() == ARMISD::SUBS) && - (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) && + ((FalseVal.getOpcode() == ARMISD::SUBS && + FalseVal.getOperand(0) == LHS && FalseVal.getOperand(1) == RHS) || + (FalseVal == LHS && isNullConstant(RHS))) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned ShiftAmount = TrueConst->logBase2(); @@ -12795,10 +13262,6 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { TrueVal = DAG.getConstant(1, dl, VT); SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); - // Make it a carry, not a borrow. 
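
// Editorial aside: a standalone check, not part of the patch, of the carry
// trick described in the comment above.  With sub = x - y, the pair
// USUBO(sub, 1) / SUBCARRY(sub, sub - 1, borrow) leaves exactly (sub != 0) in
// the result, so shifting it left by K reproduces CMOV's
// "z == 2^K if x != y, else 0".
#include <cassert>
#include <cstdint>

static uint32_t cmovNePow2(uint32_t X, uint32_t Y, unsigned K) {
  uint32_t Sub = X - Y;                 // the flag-setting SUBS
  uint32_t T1 = Sub - 1;                // USUBO value
  uint32_t Borrow = (Sub < 1) ? 1 : 0;  // USUBO borrow-out (only when Sub == 0)
  uint32_t T2 = Sub - T1 - Borrow;      // SUBCARRY: 1 if Sub != 0, else 0
  return T2 << K;
}

int main() {
  const uint32_t Samples[] = {0u, 1u, 7u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t X : Samples)
    for (uint32_t Y : Samples)
      for (unsigned K = 0; K < 8; ++K) {
        uint32_t Expected = (X != Y) ? (1u << K) : 0u;
        assert(cmovNePow2(X, Y, K) == Expected);
      }
  return 0;
}
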
- SDValue Carry = DAG.getNode( - ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1)); - Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry); if (ShiftAmount) Res = DAG.getNode(ISD::SHL, dl, VT, Res, @@ -12826,6 +13289,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; + case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget); case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); @@ -12834,6 +13298,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); @@ -12845,7 +13310,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); - case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); + case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); @@ -12854,7 +13319,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: - case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); + case ISD::SRL: + return PerformShiftCombine(N, DCI, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); @@ -12957,9 +13423,9 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } -bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, - unsigned, - unsigned, +bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, + unsigned Alignment, + MachineMemOperand::Flags, bool *Fast) const { // Depends what it gets converted into if the type is weird. if (!VT.isSimple()) @@ -12967,23 +13433,18 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); + auto Ty = VT.getSimpleVT().SimpleTy; - switch (VT.getSimpleVT().SimpleTy) { - default: - return false; - case MVT::i8: - case MVT::i16: - case MVT::i32: { + if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) { // Unaligned access can use (for example) LRDB, LRDH, LDR if (AllowsUnaligned) { if (Fast) *Fast = Subtarget->hasV7Ops(); return true; } - return false; } - case MVT::f64: - case MVT::v2f64: { + + if (Ty == MVT::f64 || Ty == MVT::v2f64) { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. 
// A big-endian target may also explicitly support unaligned accesses @@ -12992,9 +13453,54 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, *Fast = true; return true; } - return false; } + + if (!Subtarget->hasMVEIntegerOps()) + return false; + if (Ty != MVT::v16i8 && Ty != MVT::v8i16 && Ty != MVT::v8f16 && + Ty != MVT::v4i32 && Ty != MVT::v4f32 && Ty != MVT::v2i64 && + Ty != MVT::v2f64 && + // These are for truncated stores + Ty != MVT::v4i8 && Ty != MVT::v8i8 && Ty != MVT::v4i16) + return false; + + if (Subtarget->isLittle()) { + // In little-endian MVE, the store instructions VSTRB.U8, + // VSTRH.U16 and VSTRW.U32 all store the vector register in + // exactly the same format, and differ only in the range of + // their immediate offset field and the required alignment. + // + // In particular, VSTRB.U8 can store a vector at byte alignment. + // So at this stage we can simply say that loads/stores of all + // 128-bit wide vector types are permitted at any alignment, + // because we know at least _one_ instruction can manage that. + // + // Later on we might find that some of those loads are better + // generated as VLDRW.U32 if alignment permits, to take + // advantage of the larger immediate range. But for the moment, + // all that matters is that if we don't lower the load then + // _some_ instruction can handle it. + if (Fast) + *Fast = true; + return true; + } else { + // In big-endian MVE, those instructions aren't so similar + // after all, because they reorder the bytes of the vector + // differently. So this time we can only store a particular + // kind of vector if its alignment is at least the element + // type. And we can't store vectors of i64 or f64 at all + // without having to do some postprocessing, because there's + // no VSTRD.U64. + if (Ty == MVT::v16i8 || + ((Ty == MVT::v8i16 || Ty == MVT::v8f16) && Alignment >= 2) || + ((Ty == MVT::v4i32 || Ty == MVT::v4f32) && Alignment >= 4)) { + if (Fast) + *Fast = true; + return true; + } } + + return false; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, @@ -13003,24 +13509,24 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, (DstAlign == 0 || DstAlign % AlignCheck == 0)); } -EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, - unsigned DstAlign, unsigned SrcAlign, - bool IsMemset, bool ZeroMemset, - bool MemcpyStrSrc, - MachineFunction &MF) const { - const Function &F = MF.getFunction(); - +EVT ARMTargetLowering::getOptimalMemOpType( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { // See if we can use NEON instructions for this... 
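
// Editorial aside: a standalone restatement, not part of the patch, of the
// big-endian MVE alignment rule spelled out in the block above: VSTRB.U8
// handles v16i8 at any alignment, the 16-bit element types need alignment 2,
// the 32-bit element types need alignment 4, and 64-bit element vectors are
// rejected outright because there is no VSTRD.U64.  (Little-endian MVE accepts
// all of these at any alignment, as the comment above explains.)
#include <cassert>

enum class VecTy { v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 };

static bool bigEndianMVEMisalignedOK(VecTy Ty, unsigned Alignment) {
  switch (Ty) {
  case VecTy::v16i8:
    return true;
  case VecTy::v8i16:
  case VecTy::v8f16:
    return Alignment >= 2;
  case VecTy::v4i32:
  case VecTy::v4f32:
    return Alignment >= 4;
  case VecTy::v2i64:
  case VecTy::v2f64:
    return false;
  }
  return false;
}

int main() {
  assert(bigEndianMVEMisalignedOK(VecTy::v16i8, 1));
  assert(!bigEndianMVEMisalignedOK(VecTy::v8f16, 1));
  assert(bigEndianMVEMisalignedOK(VecTy::v4i32, 4));
  assert(!bigEndianMVEMisalignedOK(VecTy::v2f64, 8));
  return 0;
}
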
if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && - !F.hasFnAttribute(Attribute::NoImplicitFloat)) { + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) { + (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, + MachineMemOperand::MONone, &Fast) && + Fast))) { return MVT::v2f64; } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || - (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) && + (allowsMisalignedMemoryAccesses( + MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) && Fast))) { return MVT::f64; } @@ -13089,6 +13595,46 @@ bool ARMTargetLowering::isFNegFree(EVT VT) const { return false; } +/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth +/// of the vector elements. +static bool areExtractExts(Value *Ext1, Value *Ext2) { + auto areExtDoubled = [](Instruction *Ext) { + return Ext->getType()->getScalarSizeInBits() == + 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits(); + }; + + if (!match(Ext1, m_ZExtOrSExt(m_Value())) || + !match(Ext2, m_ZExtOrSExt(m_Value())) || + !areExtDoubled(cast<Instruction>(Ext1)) || + !areExtDoubled(cast<Instruction>(Ext2))) + return false; + + return true; +} + +/// Check if sinking \p I's operands to I's basic block is profitable, because +/// the operands can be folded into a target instruction, e.g. +/// sext/zext can be folded into vsubl. +bool ARMTargetLowering::shouldSinkOperands(Instruction *I, + SmallVectorImpl<Use *> &Ops) const { + if (!Subtarget->hasNEON() || !I->getType()->isVectorTy()) + return false; + + switch (I->getOpcode()) { + case Instruction::Sub: + case Instruction::Add: { + if (!areExtractExts(I->getOperand(0), I->getOperand(1))) + return false; + Ops.push_back(&I->getOperandUse(0)); + Ops.push_back(&I->getOperandUse(1)); + return true; + } + default: + return false; + } + return false; +} + bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); @@ -13105,7 +13651,7 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { SDNode *U = *ExtVal->use_begin(); if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || - U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) + U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM)) return false; return true; @@ -13142,7 +13688,6 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { unsigned Scale = 1; switch (VT.getSimpleVT().SimpleTy) { - default: return false; case MVT::i1: case MVT::i8: // Scale == 1; @@ -13151,7 +13696,8 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { // Scale == 2; Scale = 2; break; - case MVT::i32: + default: + // On thumb1 we load most things (i32, i64, floats, etc) with a LDR // Scale == 4; Scale = 4; break; @@ -13159,38 +13705,58 @@ static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if ((V & (Scale - 1)) != 0) return false; - V /= Scale; - return V == (V & ((1LL << 5) - 1)); + return isUInt<5>(V / Scale); } static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { - bool isNeg = false; + if (!VT.isInteger() && !VT.isFloatingPoint()) + return false; + if (VT.isVector() && Subtarget->hasNEON()) + return false; + if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() && + !Subtarget->hasMVEFloatOps()) + return false; + + bool IsNeg = false; if (V < 0) { - isNeg = true; - V = - V; + IsNeg = true; + V 
= -V; } - switch (VT.getSimpleVT().SimpleTy) { - default: return false; - case MVT::i1: - case MVT::i8: - case MVT::i16: - case MVT::i32: - // + imm12 or - imm8 - if (isNeg) - return V == (V & ((1LL << 8) - 1)); - return V == (V & ((1LL << 12) - 1)); - case MVT::f32: - case MVT::f64: - // Same as ARM mode. FIXME: NEON? - if (!Subtarget->hasVFP2()) - return false; - if ((V & 3) != 0) + unsigned NumBytes = std::max(VT.getSizeInBits() / 8, 1U); + + // MVE: size * imm7 + if (VT.isVector() && Subtarget->hasMVEIntegerOps()) { + switch (VT.getSimpleVT().getVectorElementType().SimpleTy) { + case MVT::i32: + case MVT::f32: + return isShiftedUInt<7,2>(V); + case MVT::i16: + case MVT::f16: + return isShiftedUInt<7,1>(V); + case MVT::i8: + return isUInt<7>(V); + default: return false; - V >>= 2; - return V == (V & ((1LL << 8) - 1)); + } } + + // half VLDR: 2 * imm8 + if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16()) + return isShiftedUInt<8, 1>(V); + // VLDR and LDRD: 4 * imm8 + if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8) + return isShiftedUInt<8, 2>(V); + + if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) { + // + imm12 or - imm8 + if (IsNeg) + return isUInt<8>(V); + return isUInt<12>(V); + } + + return false; } /// isLegalAddressImmediate - Return true if the integer value can be used @@ -13218,18 +13784,15 @@ static bool isLegalAddressImmediate(int64_t V, EVT VT, case MVT::i8: case MVT::i32: // +- imm12 - return V == (V & ((1LL << 12) - 1)); + return isUInt<12>(V); case MVT::i16: // +- imm8 - return V == (V & ((1LL << 8) - 1)); + return isUInt<8>(V); case MVT::f32: case MVT::f64: - if (!Subtarget->hasVFP2()) // FIXME: NEON? + if (!Subtarget->hasVFP2Base()) // FIXME: NEON? return false; - if ((V & 3) != 0) - return false; - V >>= 2; - return V == (V & ((1LL << 8) - 1)); + return isShiftedUInt<8, 2>(V); } } @@ -13649,13 +14212,13 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, EVT VT = Op.getValueType(); const unsigned DstSz = VT.getScalarSizeInBits(); const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits(); + (void)SrcSz; assert(SrcSz == Known.getBitWidth()); assert(DstSz > SrcSz); if (Op.getOpcode() == ARMISD::VGETLANEs) Known = Known.sext(DstSz); else { - Known = Known.zext(DstSz); - Known.Zero.setBitsFrom(SrcSz); + Known = Known.zext(DstSz, true /* extended bits are known zero */); } assert(DstSz == Known.getBitWidth()); break; @@ -13790,7 +14353,7 @@ const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. - if (!Subtarget->hasVFP2()) + if (!Subtarget->hasVFP2Base()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; @@ -13822,6 +14385,7 @@ ARMTargetLowering::getConstraintType(StringRef Constraint) const { } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; + case 'T': return C_RegisterClass; // All 'U+' constraints are addresses. case 'U': return C_Memory; } @@ -13867,7 +14431,8 @@ using RCPair = std::pair<unsigned, const TargetRegisterClass *>; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint.size() == 1) { + switch (Constraint.size()) { + case 1: // GCC ARM Constraint Letters switch (Constraint[0]) { case 'l': // Low regs or general regs. 
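
// Editorial aside: a standalone sketch, not part of the patch, of the offset
// ranges encoded by the isUInt / isShiftedUInt checks in the new
// isLegalT2AddressImmediate above, using local reimplementations of those
// helpers.  An MVE load/store offset is a 7-bit unsigned immediate scaled by
// the element size, a VLDR/LDRD offset is an 8-bit immediate scaled by 4, and
// the plain integer forms take +imm12 / -imm8.
#include <cassert>
#include <cstdint>

template <unsigned N> static bool isUIntN(uint64_t V) {
  return N >= 64 || V < (uint64_t(1) << N);
}
template <unsigned N, unsigned S> static bool isShiftedUIntN(uint64_t V) {
  return (V & ((uint64_t(1) << S) - 1)) == 0 && isUIntN<N + S>(V);
}

// Offset legality for an MVE vector access, keyed on element size in bytes.
static bool isLegalMVEOffset(uint64_t V, unsigned EltBytes) {
  switch (EltBytes) {
  case 4: return isShiftedUIntN<7, 2>(V);  // i32/f32 lanes: 4 * imm7
  case 2: return isShiftedUIntN<7, 1>(V);  // i16/f16 lanes: 2 * imm7
  case 1: return isUIntN<7>(V);            // i8 lanes: imm7
  default: return false;
  }
}

int main() {
  assert(isLegalMVEOffset(508, 4) && !isLegalMVEOffset(512, 4));
  assert(!isLegalMVEOffset(2, 4));          // must be a multiple of 4
  assert(isLegalMVEOffset(254, 2) && isLegalMVEOffset(127, 1));
  assert(isShiftedUIntN<8, 2>(1020));       // largest VLDR/LDRD offset
  assert(!isShiftedUIntN<8, 2>(1024));
  return 0;
}
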
@@ -13913,7 +14478,25 @@ RCPair ARMTargetLowering::getRegForInlineAsmConstraint( return RCPair(0U, &ARM::QPR_VFP2RegClass); break; } + break; + + case 2: + if (Constraint[0] == 'T') { + switch (Constraint[1]) { + default: + break; + case 'e': + return RCPair(0U, &ARM::tGPREvenRegClass); + case 'o': + return RCPair(0U, &ARM::tGPROddRegClass); + } + } + break; + + default: + break; } + if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); @@ -14272,28 +14855,107 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && + SDValue SrcVal = Op.getOperand(0); + const unsigned DstSz = Op.getValueType().getSizeInBits(); + const unsigned SrcSz = SrcVal.getValueType().getSizeInBits(); + assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 && "Unexpected type for custom-lowering FP_EXTEND"); + assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && + "With both FP DP and 16, any FP conversion is legal!"); + + assert(!(DstSz == 32 && Subtarget->hasFP16()) && + "With FP16, 16 to 32 conversion is legal!"); + + // Either we are converting from 16 -> 64, without FP16 and/or + // FP.double-precision or without Armv8-fp. So we must do it in two + // steps. + // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32 + // without FP16. So we must do a function call. + SDLoc Loc(Op); RTLIB::Libcall LC; - LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); + if (SrcSz == 16) { + // Instruction from 16 -> 32 + if (Subtarget->hasFP16()) + SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, SrcVal); + // Lib call from 16 -> 32 + else { + LC = RTLIB::getFPEXT(MVT::f16, MVT::f32); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_EXTEND"); + SrcVal = + makeLibCall(DAG, LC, MVT::f32, SrcVal, /*isSigned*/ false, Loc).first; + } + } - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, - SDLoc(Op)).first; + if (DstSz != 64) + return SrcVal; + // For sure now SrcVal is 32 bits + if (Subtarget->hasFP64()) // Instruction from 32 -> 64 + return DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f64, SrcVal); + + LC = RTLIB::getFPEXT(MVT::f32, MVT::f64); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_EXTEND"); + return makeLibCall(DAG, LC, MVT::f64, SrcVal, /*isSigned*/ false, Loc).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { - assert(Op.getOperand(0).getValueType() == MVT::f64 && - Subtarget->isFPOnlySP() && + SDValue SrcVal = Op.getOperand(0); + EVT SrcVT = SrcVal.getValueType(); + EVT DstVT = Op.getValueType(); + const unsigned DstSz = Op.getValueType().getSizeInBits(); + const unsigned SrcSz = SrcVT.getSizeInBits(); + (void)DstSz; + assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 && "Unexpected type for custom-lowering FP_ROUND"); - RTLIB::Libcall LC; - LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); + assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) && + "With both FP DP and 16, any FP conversion is legal!"); - SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, - SDLoc(Op)).first; + SDLoc Loc(Op); + + // Instruction from 32 -> 16 if hasFP16 is valid + if (SrcSz == 32 && 
Subtarget->hasFP16()) + return Op; + + // Lib call from 32 -> 16 / 64 -> [32, 16] + RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT); + assert(LC != RTLIB::UNKNOWN_LIBCALL && + "Unexpected type for custom-lowering FP_ROUND"); + return makeLibCall(DAG, LC, DstVT, SrcVal, /*isSigned*/ false, Loc).first; +} + +void ARMTargetLowering::lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + assert(N->getValueType(0) == MVT::i64 && "Unexpected type (!= i64) on ABS."); + MVT HalfT = MVT::i32; + SDLoc dl(N); + SDValue Hi, Lo, Tmp; + + if (!isOperationLegalOrCustom(ISD::ADDCARRY, HalfT) || + !isOperationLegalOrCustom(ISD::UADDO, HalfT)) + return ; + + unsigned OpTypeBits = HalfT.getScalarSizeInBits(); + SDVTList VTList = DAG.getVTList(HalfT, MVT::i1); + + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(0, dl, HalfT)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(0), + DAG.getConstant(1, dl, HalfT)); + + Tmp = DAG.getNode(ISD::SRA, dl, HalfT, Hi, + DAG.getConstant(OpTypeBits - 1, dl, + getShiftAmountTy(HalfT, DAG.getDataLayout()))); + Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo); + Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi, + SDValue(Lo.getNode(), 1)); + Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi); + Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo); + + Results.push_back(Lo); + Results.push_back(Hi); } bool @@ -14314,14 +14976,15 @@ bool ARM::isBitFieldInvertedMask(unsigned v) { /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. -bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { - if (!Subtarget->hasVFP3()) +bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + if (!Subtarget->hasVFP3Base()) return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; - if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) + if (VT == MVT::f64 && Subtarget->hasFP64()) return ARM_AM::getFP64Imm(Imm) != -1; return false; } @@ -14590,6 +15253,9 @@ ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // and up to 64 bits on the non-M profiles TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { + if (AI->isFloatingPointOperation()) + return AtomicExpansionKind::CmpXChg; + unsigned Size = AI->getType()->getPrimitiveSizeInBits(); bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) @@ -14621,6 +15287,36 @@ bool ARMTargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO(); } +void ARMTargetLowering::insertSSPDeclarations(Module &M) const { + if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return TargetLowering::insertSSPDeclarations(M); + + // MSVC CRT has a global variable holding security cookie. + M.getOrInsertGlobal("__security_cookie", + Type::getInt8PtrTy(M.getContext())); + + // MSVC CRT has a function to validate security cookie. 
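
// Editorial aside: a standalone check, not part of the patch, of the expansion
// lowerABS builds above: splat the sign of the high half with an arithmetic
// shift by 31, add it to both halves with carry, then XOR both halves with it.
// For negative inputs that adds -1 and complements, i.e. negates; for
// non-negative inputs it is a no-op.  (Like llabs, this wraps on INT64_MIN.)
#include <cassert>
#include <cstdint>

static uint64_t absViaHalves(int64_t X) {
  uint32_t Lo = uint32_t(uint64_t(X));
  uint32_t Hi = uint32_t(uint64_t(X) >> 32);
  uint32_t Tmp = uint32_t(int32_t(Hi) >> 31);  // 0 or 0xFFFFFFFF (sign splat)
  uint64_t Sum = uint64_t(Lo) + Tmp;           // UADDO
  uint32_t NewLo = uint32_t(Sum);
  uint32_t Carry = uint32_t(Sum >> 32);
  uint32_t NewHi = Hi + Tmp + Carry;           // ADDCARRY
  NewLo ^= Tmp;                                // conditional complement
  NewHi ^= Tmp;
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  const int64_t Samples[] = {0, 1, -1, 123456789, -123456789,
                             INT64_MAX, INT64_MIN + 1};
  for (int64_t X : Samples) {
    uint64_t Expected = X < 0 ? uint64_t(0) - uint64_t(X) : uint64_t(X);
    assert(absViaHalves(X) == Expected);
  }
  return 0;
}
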
+ FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( + "__security_check_cookie", Type::getVoidTy(M.getContext()), + Type::getInt8PtrTy(M.getContext())); + if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) + F->addAttribute(1, Attribute::AttrKind::InReg); +} + +Value *ARMTargetLowering::getSDagStackGuard(const Module &M) const { + // MSVC CRT has a global variable holding security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getGlobalVariable("__security_cookie"); + return TargetLowering::getSDagStackGuard(M); +} + +Function *ARMTargetLowering::getSSPStackGuardCheck(const Module &M) const { + // MSVC CRT has a function to validate security cookie. + if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) + return M.getFunction("__security_check_cookie"); + return TargetLowering::getSSPStackGuardCheck(M); +} + bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { // If we do not have NEON, vector types are not natively supported. @@ -14658,6 +15354,10 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } +bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { + return !Subtarget->hasMinSize(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -14850,8 +15550,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) - BaseAddr = Builder.CreateConstGEP1_32( - BaseAddr, VecTy->getVectorNumElements() * Factor); + BaseAddr = + Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr, + VecTy->getVectorNumElements() * Factor); SmallVector<Value *, 2> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); @@ -14990,7 +15691,8 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) - BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); + BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(), + BaseAddr, LaneLen * Factor); SmallVector<Value *, 6> Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); |
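
// Editorial aside: a standalone sketch, not part of the patch, of the
// addressing used above for multi-part interleaved loads: when a deinterleave
// needs more than one wide load, each subsequent group starts
// NumElts * Factor elements past the previous base, which is exactly the
// stride the CreateConstGEP1_32 call advances by.  Plain scalar code stands in
// here for the NEON/MVE operations; all names are local to this sketch.
#include <cassert>
#include <cstdint>
#include <vector>

// Deinterleave Src (Factor-way interleaved) into Factor output vectors,
// NumElts lanes at a time, advancing the base as the lowering does.
static std::vector<std::vector<uint32_t>>
deinterleave(const std::vector<uint32_t> &Src, unsigned Factor,
             unsigned NumElts) {
  std::vector<std::vector<uint32_t>> Out(Factor);
  unsigned Groups = unsigned(Src.size() / (Factor * NumElts));
  for (unsigned G = 0; G < Groups; ++G) {
    unsigned Base = G * NumElts * Factor;  // the per-group base-address step
    for (unsigned Lane = 0; Lane < NumElts; ++Lane)
      for (unsigned F = 0; F < Factor; ++F)
        Out[F].push_back(Src[Base + Lane * Factor + F]);
  }
  return Out;
}

int main() {
  // Two groups of a 3-way, 4-lane deinterleave (e.g. two vld3 operations).
  std::vector<uint32_t> Src;
  for (uint32_t I = 0; I < 2 * 3 * 4; ++I)
    Src.push_back(I);
  auto Out = deinterleave(Src, /*Factor=*/3, /*NumElts=*/4);
  assert(Out[0][0] == 0 && Out[1][0] == 1 && Out[2][0] == 2);
  assert(Out[0][4] == 12);  // second group starts 4 * 3 elements further on
  return 0;
}
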