author | Dimitry Andric <dim@FreeBSD.org> | 2016-07-23 20:41:05 +0000
---|---|---
committer | Dimitry Andric <dim@FreeBSD.org> | 2016-07-23 20:41:05 +0000
commit | 01095a5d43bbfde13731688ddcf6048ebb8b7721 (patch) |
tree | 4def12e759965de927d963ac65840d663ef9d1ea /lib/Target/X86/X86InstrInfo.cpp |
parent | f0f4822ed4b66e3579e92a89f368f8fb860e218e (diff) |
Diffstat (limited to 'lib/Target/X86/X86InstrInfo.cpp')
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 2229
1 file changed, 1508 insertions, 721 deletions
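Besides the new folding-table entries, much of the churn in the diff below comes from migrating X86InstrInfo hooks from `const MachineInstr *MI` parameters to `const MachineInstr &MI` (getSPAdjust, isLoadFromStackSlot, reMaterialize, commuteInstructionImpl, and others). The following is a minimal sketch of that style change only; the `Instr` type and the `spAdjust*` function names are stand-ins for illustration, not LLVM's own classes.

```cpp
#include <cassert>

// Stand-in for llvm::MachineInstr, just enough to show the call shapes.
struct Instr {
  int Opcode;
  long Imm;
};

// Old style: pointer parameter, caller may pass null, callee uses '->'.
long spAdjustOld(const Instr *MI) {
  assert(MI && "expected an instruction");
  return MI->Imm;
}

// New style, as used throughout this diff: reference parameter, non-null by
// construction, callee uses '.'.
long spAdjustNew(const Instr &MI) {
  return MI.Imm;
}

int main() {
  Instr Push{/*Opcode=*/1, /*Imm=*/8};
  return spAdjustOld(&Push) == spAdjustNew(Push) ? 0 : 1;
}
```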
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 246804e34289a..1672b3855b798 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -18,11 +18,13 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -36,7 +38,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" -#include <limits> using namespace llvm; @@ -57,6 +58,17 @@ static cl::opt<bool> ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden); +static cl::opt<unsigned> +PartialRegUpdateClearance("partial-reg-update-clearance", + cl::desc("Clearance between two register writes " + "for inserting XOR to avoid partial " + "register update"), + cl::init(64), cl::Hidden); +static cl::opt<unsigned> +UndefRegClearance("undef-reg-clearance", + cl::desc("How many idle instructions we would like before " + "certain undef register reads"), + cl::init(64), cl::Hidden); enum { // Select which memory operand is being unfolded. @@ -105,7 +117,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32), - X86::CATCHRET), + X86::CATCHRET, + (STI.is64Bit() ? X86::RETQ : X86::RETL)), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -804,50 +817,54 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, // AVX-512 foldable instructions - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, - { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, - { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, - { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, - { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, - { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, + { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, + { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, + { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, + { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, + { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, + { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, + { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, + { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, + { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, + { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, + { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, + { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, + { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, + { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, 
TB_NO_REVERSE }, + { X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE }, // AVX-512 foldable instructions (256-bit versions) - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, - { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, + { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, + { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, + { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, + { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, + { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, + { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, + { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, + { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, + { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + // AVX-512 foldable instructions (128-bit versions) + { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, + { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, + { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, + { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, + { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, + { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, + { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, + { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, + { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, + { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, // F16C foldable instructions { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, @@ -998,6 +1015,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 }, + { X86::MOVLHPSrr, X86::MOVHPSrm, 
TB_NO_REVERSE }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, @@ -1023,7 +1041,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, - { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, + { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 }, { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, @@ -1073,9 +1091,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, - { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, - { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, - { X86::PSIGNDrr, X86::PSIGNDrm, TB_ALIGN_16 }, + { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 }, + { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 }, + { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 }, { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, @@ -1298,6 +1316,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 }, + { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, @@ -1319,7 +1338,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, { X86::VPADDWrr, X86::VPADDWrm, 0 }, - { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, + { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 }, { X86::VPANDNrr, X86::VPANDNrm, 0 }, { X86::VPANDrr, X86::VPANDrm, 0 }, { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, @@ -1371,9 +1390,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPORrr, X86::VPORrm, 0 }, { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, - { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, - { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, - { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, + { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 }, + { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 }, + { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 }, { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, @@ -1475,7 +1494,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, - { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, + { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 }, { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, { X86::VPANDYrr, X86::VPANDYrm, 0 }, { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, @@ -1526,9 +1545,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPORYrr, X86::VPORYrm, 0 }, { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, - { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, - { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, - { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, + { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 }, + { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 }, + { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 }, { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, 
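The hunks above extend X86InstrInfo's register-to-memory folding tables (for example PALIGNRrri -> PALIGNRrmi, the new VBROADCASTSS*_s aliases, and the AVX-512 scalar FP entries). As a rough sketch of the data shape these tables follow, each row pairs a register-form opcode with its memory-form counterpart plus constraint flags such as a required alignment (TB_ALIGN_16) or a "one direction only" marker (TB_NO_REVERSE). The code below is illustrative only, with invented opcode numbers and flag bits; it is not the LLVM implementation.

```cpp
#include <cstdint>

// One fold-table row: register-form opcode, memory-form opcode, flags.
struct FoldTableEntry {
  uint16_t RegOp;
  uint16_t MemOp;
  uint16_t Flags;
};

// Hypothetical sample entries mirroring the table layout in the diff.
static const FoldTableEntry FoldTable[] = {
    {/*RegOp=*/100, /*MemOp=*/101, /*Flags=*/0x0010}, // e.g. needs 16-byte alignment
    {/*RegOp=*/102, /*MemOp=*/103, /*Flags=*/0x0000},
};

// Folding an operand then reduces to a keyed lookup from the register-form
// opcode to its memory-form opcode and constraints.
const FoldTableEntry *lookupFold(uint16_t RegOp) {
  for (const FoldTableEntry &E : FoldTable)
    if (E.RegOp == RegOp)
      return &E;
  return nullptr;
}

int main() {
  const FoldTableEntry *E = lookupFold(100);
  return (E && E->MemOp == 101) ? 0 : 1;
}
```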
@@ -1540,6 +1559,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, + { X86::VPSRAVD_Intrr, X86::VPSRAVD_Intrm, 0 }, + { X86::VPSRAVD_IntYrr, X86::VPSRAVD_IntYrm, 0 }, { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, @@ -1600,8 +1621,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_NONE }, // XOP foldable instructions - { X86::VPCMOVrr, X86::VPCMOVmr, 0 }, - { X86::VPCMOVrrY, X86::VPCMOVmrY, 0 }, + { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, + { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 }, { X86::VPCOMBri, X86::VPCOMBmi, 0 }, { X86::VPCOMDri, X86::VPCOMDmi, 0 }, { X86::VPCOMQri, X86::VPCOMQmi, 0 }, @@ -1626,7 +1647,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, - { X86::VPPERMrr, X86::VPPERMmr, 0 }, + { X86::VPPERMrrr, X86::VPPERMrmr, 0 }, { X86::VPROTBrr, X86::VPROTBrm, 0 }, { X86::VPROTDrr, X86::VPROTDrm, 0 }, { X86::VPROTQrr, X86::VPROTQrm, 0 }, @@ -1659,12 +1680,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, + { X86::VADDSSZrr, X86::VADDSSZrm, 0 }, + { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, 0 }, + { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, + { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, 0 }, { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, + { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, + { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, 0 }, + { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, + { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, 0 }, { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, + { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, + { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, 0 }, + { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, + { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, 0 }, { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, + { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, + { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, 0 }, + { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, + { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, 0 }, { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, @@ -1902,13 +1939,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_NONE }, // XOP foldable instructions - { X86::VPCMOVrr, X86::VPCMOVrm, 0 }, - { X86::VPCMOVrrY, X86::VPCMOVrmY, 0 }, + { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 }, + { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 }, { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 }, { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 }, { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 }, { X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 }, - { X86::VPPERMrr, X86::VPPERMrm, 0 }, + { X86::VPPERMrrr, X86::VPPERMrrm, 0 }, // AVX-512 VPERMI instructions with 3 source operands. 
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, @@ -2025,7 +2062,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) void X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable, MemOp2RegOpTableType &M2RTable, - unsigned RegOp, unsigned MemOp, unsigned Flags) { + uint16_t RegOp, uint16_t MemOp, uint16_t Flags) { if ((Flags & TB_NO_FORWARD) == 0) { assert(!R2MTable.count(RegOp) && "Duplicate entry!"); R2MTable[RegOp] = std::make_pair(MemOp, Flags); @@ -2085,19 +2122,19 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, return false; } -int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { - const MachineFunction *MF = MI->getParent()->getParent(); +int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { + const MachineFunction *MF = MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - if (MI->getOpcode() == getCallFrameSetupOpcode() || - MI->getOpcode() == getCallFrameDestroyOpcode()) { + if (MI.getOpcode() == getCallFrameSetupOpcode() || + MI.getOpcode() == getCallFrameDestroyOpcode()) { unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * - StackAlign; + int SPAdj = + (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign; - SPAdj -= MI->getOperand(1).getImm(); + SPAdj -= MI.getOperand(1).getImm(); - if (MI->getOpcode() == getCallFrameSetupOpcode()) + if (MI.getOpcode() == getCallFrameSetupOpcode()) return SPAdj; else return -SPAdj; @@ -2106,8 +2143,8 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { // To know whether a call adjusts the stack, we need information // that is bound to the following ADJCALLSTACKUP pseudo. // Look for the next ADJCALLSTACKUP that follows the call. - if (MI->isCall()) { - const MachineBasicBlock* MBB = MI->getParent(); + if (MI.isCall()) { + const MachineBasicBlock *MBB = MI.getParent(); auto I = ++MachineBasicBlock::const_iterator(MI); for (auto E = MBB->end(); I != E; ++I) { if (I->getOpcode() == getCallFrameDestroyOpcode() || @@ -2125,7 +2162,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { // Currently handle only PUSHes we can reasonably expect to see // in call sequences - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return 0; case X86::PUSH32i8: @@ -2134,21 +2171,27 @@ int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { case X86::PUSH32rmr: case X86::PUSHi32: return 4; + case X86::PUSH64i8: + case X86::PUSH64r: + case X86::PUSH64rmm: + case X86::PUSH64rmr: + case X86::PUSH64i32: + return 8; } } /// Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. 
-bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, +bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op, int &FrameIndex) const { - if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && - MI->getOperand(Op+X86::AddrScaleAmt).isImm() && - MI->getOperand(Op+X86::AddrIndexReg).isReg() && - MI->getOperand(Op+X86::AddrDisp).isImm() && - MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && - MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && - MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { - FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); + if (MI.getOperand(Op + X86::AddrBaseReg).isFI() && + MI.getOperand(Op + X86::AddrScaleAmt).isImm() && + MI.getOperand(Op + X86::AddrIndexReg).isReg() && + MI.getOperand(Op + X86::AddrDisp).isImm() && + MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 && + MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 && + MI.getOperand(Op + X86::AddrDisp).getImm() == 0) { + FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex(); return true; } return false; @@ -2166,13 +2209,19 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::MOVSSrm: case X86::MOVSDrm: case X86::MOVAPSrm: + case X86::MOVUPSrm: case X86::MOVAPDrm: + case X86::MOVUPDrm: case X86::MOVDQArm: + case X86::MOVDQUrm: case X86::VMOVSSrm: case X86::VMOVSDrm: case X86::VMOVAPSrm: + case X86::VMOVUPSrm: case X86::VMOVAPDrm: + case X86::VMOVUPDrm: case X86::VMOVDQArm: + case X86::VMOVDQUrm: case X86::VMOVUPSYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: @@ -2181,8 +2230,42 @@ static bool isFrameLoadOpcode(int Opcode) { case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: + case X86::VMOVSSZrm: + case X86::VMOVSDZrm: case X86::VMOVAPSZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: case X86::VMOVUPSZrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVUPDZrm: + case X86::VMOVUPDZ128rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::KMOVBkm: + case X86::KMOVWkm: + case X86::KMOVDkm: + case X86::KMOVQkm: return true; } } @@ -2198,40 +2281,80 @@ static bool isFrameStoreOpcode(int Opcode) { case X86::MOVSSmr: case X86::MOVSDmr: case X86::MOVAPSmr: + case X86::MOVUPSmr: case X86::MOVAPDmr: + case X86::MOVUPDmr: case X86::MOVDQAmr: + case X86::MOVDQUmr: case X86::VMOVSSmr: case X86::VMOVSDmr: case X86::VMOVAPSmr: + case X86::VMOVUPSmr: case X86::VMOVAPDmr: + case X86::VMOVUPDmr: case X86::VMOVDQAmr: + case X86::VMOVDQUmr: case X86::VMOVUPSYmr: case X86::VMOVAPSYmr: case X86::VMOVUPDYmr: case X86::VMOVAPDYmr: case X86::VMOVDQUYmr: case X86::VMOVDQAYmr: + case X86::VMOVSSZmr: + case X86::VMOVSDZmr: case X86::VMOVUPSZmr: + case X86::VMOVUPSZ128mr: + case X86::VMOVUPSZ256mr: case X86::VMOVAPSZmr: + case X86::VMOVAPSZ128mr: + case X86::VMOVAPSZ256mr: + case X86::VMOVUPDZmr: + case X86::VMOVUPDZ128mr: + case X86::VMOVUPDZ256mr: + case X86::VMOVAPDZmr: + case X86::VMOVAPDZ128mr: + case X86::VMOVAPDZ256mr: + case X86::VMOVDQA32Zmr: + case 
X86::VMOVDQA32Z128mr: + case X86::VMOVDQA32Z256mr: + case X86::VMOVDQU32Zmr: + case X86::VMOVDQU32Z128mr: + case X86::VMOVDQU32Z256mr: + case X86::VMOVDQA64Zmr: + case X86::VMOVDQA64Z128mr: + case X86::VMOVDQA64Z256mr: + case X86::VMOVDQU64Zmr: + case X86::VMOVDQU64Z128mr: + case X86::VMOVDQU64Z256mr: + case X86::VMOVDQU8Zmr: + case X86::VMOVDQU8Z128mr: + case X86::VMOVDQU8Z256mr: + case X86::VMOVDQU16Zmr: + case X86::VMOVDQU16Z128mr: + case X86::VMOVDQU16Z256mr: case X86::MMX_MOVD64mr: case X86::MMX_MOVQ64mr: case X86::MMX_MOVNTQmr: + case X86::KMOVBmk: + case X86::KMOVWmk: + case X86::KMOVDmk: + case X86::KMOVQmk: return true; } return false; } -unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - if (isFrameLoadOpcode(MI->getOpcode())) - if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) - return MI->getOperand(0).getReg(); + if (isFrameLoadOpcode(MI.getOpcode())) + if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex)) + return MI.getOperand(0).getReg(); return 0; } -unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, +unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { - if (isFrameLoadOpcode(MI->getOpcode())) { + if (isFrameLoadOpcode(MI.getOpcode())) { unsigned Reg; if ((Reg = isLoadFromStackSlot(MI, FrameIndex))) return Reg; @@ -2242,18 +2365,18 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, return 0; } -unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - if (isFrameStoreOpcode(MI->getOpcode())) - if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 && + if (isFrameStoreOpcode(MI.getOpcode())) + if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 && isFrameOperand(MI, 0, FrameIndex)) - return MI->getOperand(X86::AddrNumOperands).getReg(); + return MI.getOperand(X86::AddrNumOperands).getReg(); return 0; } -unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI, +unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const { - if (isFrameStoreOpcode(MI->getOpcode())) { + if (isFrameStoreOpcode(MI.getOpcode())) { unsigned Reg; if ((Reg = isStoreToStackSlot(MI, FrameIndex))) return Reg; @@ -2281,10 +2404,9 @@ static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool -X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch (MI->getOpcode()) { +bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, + AliasAnalysis *AA) const { + switch (MI.getOpcode()) { default: break; case X86::MOV8rm: case X86::MOV16rm: @@ -2345,18 +2467,18 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::VMOVUPSZ256rm: case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
- if (MI->getOperand(1+X86::AddrBaseReg).isReg() && - MI->getOperand(1+X86::AddrScaleAmt).isImm() && - MI->getOperand(1+X86::AddrIndexReg).isReg() && - MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && - MI->isInvariantLoad(AA)) { - unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + if (MI.getOperand(1 + X86::AddrBaseReg).isReg() && + MI.getOperand(1 + X86::AddrScaleAmt).isImm() && + MI.getOperand(1 + X86::AddrIndexReg).isReg() && + MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + MI.isInvariantLoad(AA)) { + unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0 || BaseReg == X86::RIP) return true; // Allow re-materialization of PIC load. - if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal()) + if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal()) return false; - const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return regIsPICBase(BaseReg, MRI); } @@ -2365,18 +2487,18 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::LEA32r: case X86::LEA64r: { - if (MI->getOperand(1+X86::AddrScaleAmt).isImm() && - MI->getOperand(1+X86::AddrIndexReg).isReg() && - MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 && - !MI->getOperand(1+X86::AddrDisp).isReg()) { + if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() && + MI.getOperand(1 + X86::AddrIndexReg).isReg() && + MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + !MI.getOperand(1 + X86::AddrDisp).isReg()) { // lea fi#, lea GV, etc. are all rematerializable. - if (!MI->getOperand(1+X86::AddrBaseReg).isReg()) + if (!MI.getOperand(1 + X86::AddrBaseReg).isReg()) return true; - unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg(); + unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg(); if (BaseReg == 0) return true; // Allow re-materialization of lea PICBase + x. - const MachineFunction &MF = *MI->getParent()->getParent(); + const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); return regIsPICBase(BaseReg, MRI); } @@ -2469,10 +2591,10 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg, unsigned SubIdx, - const MachineInstr *Orig, + const MachineInstr &Orig, const TargetRegisterInfo &TRI) const { bool ClobbersEFLAGS = false; - for (const MachineOperand &MO : Orig->operands()) { + for (const MachineOperand &MO : Orig.operands()) { if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { ClobbersEFLAGS = true; break; @@ -2483,7 +2605,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. 
int Value; - switch (Orig->getOpcode()) { + switch (Orig.getOpcode()) { case X86::MOV32r0: Value = 0; break; case X86::MOV32r1: Value = 1; break; case X86::MOV32r_1: Value = -1; break; @@ -2491,22 +2613,23 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, llvm_unreachable("Unexpected instruction!"); } - DebugLoc DL = Orig->getDebugLoc(); - BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(Value); + const DebugLoc &DL = Orig.getDebugLoc(); + BuildMI(MBB, I, DL, get(X86::MOV32ri)) + .addOperand(Orig.getOperand(0)) + .addImm(Value); } else { - MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); + MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig); MBB.insert(I, MI); } - MachineInstr *NewMI = std::prev(I); - NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI); + MachineInstr &NewMI = *std::prev(I); + NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI); } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && !MO.isDead()) { return true; @@ -2516,11 +2639,11 @@ bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { } /// Check whether the shift count for a machine operand is non-zero. -inline static unsigned getTruncatedShiftCount(MachineInstr *MI, +inline static unsigned getTruncatedShiftCount(MachineInstr &MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. - unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; - unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); + unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31; + unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm(); return Imm & ShiftCountMask; } @@ -2535,11 +2658,11 @@ inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { return ShAmt < 4 && ShAmt > 0; } -bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, - unsigned Opc, bool AllowSP, - unsigned &NewSrc, bool &isKill, bool &isUndef, +bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, + unsigned Opc, bool AllowSP, unsigned &NewSrc, + bool &isKill, bool &isUndef, MachineOperand &ImplicitOp) const { - MachineFunction &MF = *MI->getParent()->getParent(); + MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; @@ -2571,7 +2694,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = - MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); + MI.getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); switch (LQR) { case MachineBasicBlock::LQR_Unknown: @@ -2579,7 +2702,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, // formation. 
return false; case MachineBasicBlock::LQR_Live: - isKill = MI->killsRegister(SrcReg); + isKill = MI.killsRegister(SrcReg); isUndef = false; break; default: @@ -2592,9 +2715,8 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - get(TargetOpcode::COPY)) - .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. @@ -2609,16 +2731,14 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, /// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit /// LEA to form 3-address code by promoting to a 32-bit superregister and then /// truncating back down to a 16-bit subregister. -MachineInstr * -X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, - MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { - MachineInstr *MI = MBBI; - unsigned Dest = MI->getOperand(0).getReg(); - unsigned Src = MI->getOperand(1).getReg(); - bool isDead = MI->getOperand(0).isDead(); - bool isKill = MI->getOperand(1).isKill(); +MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( + unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI, + LiveVariables *LV) const { + MachineBasicBlock::iterator MBBI = MI.getIterator(); + unsigned Dest = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + bool isDead = MI.getOperand(0).isDead(); + bool isKill = MI.getOperand(1).isKill(); MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo(); unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass); @@ -2638,19 +2758,19 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, // leal -65(%rdx), %esi // But testing has shown this *does* help performance in 64-bit mode (at // least on modern x86 machines). 
- BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg); MachineInstr *InsMI = - BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(leaInReg, RegState::Define, X86::sub_16bit) - .addReg(Src, getKillRegState(isKill)); + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg, RegState::Define, X86::sub_16bit) + .addReg(Src, getKillRegState(isKill)); - MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(), - get(Opc), leaOutReg); + MachineInstrBuilder MIB = + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg); switch (MIOpc) { default: llvm_unreachable("Unreachable!"); case X86::SHL16ri: { - unsigned ShAmt = MI->getOperand(2).getImm(); - MIB.addReg(0).addImm(1 << ShAmt) + unsigned ShAmt = MI.getOperand(2).getImm(); + MIB.addReg(0).addImm(1ULL << ShAmt) .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0); break; } @@ -2664,12 +2784,12 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::ADD16ri8: case X86::ADD16ri_DB: case X86::ADD16ri8_DB: - addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm()); + addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm()); break; case X86::ADD16rr: case X86::ADD16rr_DB: { - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); + unsigned Src2 = MI.getOperand(2).getReg(); + bool isKill2 = MI.getOperand(2).isKill(); unsigned leaInReg2 = 0; MachineInstr *InsMI2 = nullptr; if (Src == Src2) { @@ -2683,33 +2803,32 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); // Build and insert into an implicit UNDEF value. This is OK because // well be shifting and then extracting the lower 16-bits. 
- BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2); - InsMI2 = - BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(leaInReg2, RegState::Define, X86::sub_16bit) - .addReg(Src2, getKillRegState(isKill2)); + BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2); + InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(leaInReg2, RegState::Define, X86::sub_16bit) + .addReg(Src2, getKillRegState(isKill2)); addRegReg(MIB, leaInReg, true, leaInReg2, true); } if (LV && isKill2 && InsMI2) - LV->replaceKillInstruction(Src2, MI, InsMI2); + LV->replaceKillInstruction(Src2, MI, *InsMI2); break; } } MachineInstr *NewMI = MIB; MachineInstr *ExtMI = - BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY)) - .addReg(Dest, RegState::Define | getDeadRegState(isDead)) - .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); + BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) + .addReg(Dest, RegState::Define | getDeadRegState(isDead)) + .addReg(leaOutReg, RegState::Kill, X86::sub_16bit); if (LV) { // Update live variables LV->getVarInfo(leaInReg).Kills.push_back(NewMI); LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI); if (isKill) - LV->replaceKillInstruction(Src, MI, InsMI); + LV->replaceKillInstruction(Src, MI, *InsMI); if (isDead) - LV->replaceKillInstruction(Dest, MI, ExtMI); + LV->replaceKillInstruction(Dest, MI, *ExtMI); } return ExtMI; @@ -2727,20 +2846,17 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, /// MachineInstr * X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, - MachineBasicBlock::iterator &MBBI, - LiveVariables *LV) const { - MachineInstr *MI = MBBI; - + MachineInstr &MI, LiveVariables *LV) const { // The following opcodes also sets the condition code register(s). Only // convert them to equivalent lea if the condition code register def's // are dead! if (hasLiveCondCodeDef(MI)) return nullptr; - MachineFunction &MF = *MI->getParent()->getParent(); + MachineFunction &MF = *MI.getParent()->getParent(); // All instructions input are two-addr instructions. Get the known operands. - const MachineOperand &Dest = MI->getOperand(0); - const MachineOperand &Src = MI->getOperand(1); + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); MachineInstr *NewMI = nullptr; // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. 
When @@ -2749,11 +2865,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, bool DisableLEA16 = true; bool is64Bit = Subtarget.is64Bit(); - unsigned MIOpc = MI->getOpcode(); + unsigned MIOpc = MI.getOpcode(); switch (MIOpc) { default: return nullptr; case X86::SHL64ri: { - assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; @@ -2763,13 +2879,17 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, &X86::GR64_NOSPRegClass)) return nullptr; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest) - .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest) + .addReg(0) + .addImm(1ULL << ShAmt) + .addOperand(Src) + .addImm(0) + .addReg(0); break; } case X86::SHL32ri: { - assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; @@ -2783,11 +2903,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg, isKill, isUndef, ImplicitOp)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(0).addImm(1 << ShAmt) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) - .addImm(0).addReg(0); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(0) + .addImm(1ULL << ShAmt) + .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)) + .addImm(0) + .addReg(0); if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); NewMI = MIB; @@ -2795,20 +2918,25 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, break; } case X86::SHL16ri: { - assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!"); unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr; if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr; - NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest) - .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0); + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) + : nullptr; + NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addReg(0) + .addImm(1ULL << ShAmt) + .addOperand(Src) + .addImm(0) + .addReg(0); break; } case X86::INC64r: case X86::INC32r: { - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); + assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r : (is64Bit ? 
X86::LEA64_32r : X86::LEA32r); bool isKill, isUndef; @@ -2818,9 +2946,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg, isKill, isUndef, ImplicitOp)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef)); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, + getKillRegState(isKill) | getUndefRegState(isUndef)); if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); @@ -2829,15 +2959,17 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::INC16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), 1); + assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!"); + NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addOperand(Src), + 1); break; case X86::DEC64r: case X86::DEC32r: { - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); + assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r : (is64Bit ? X86::LEA64_32r : X86::LEA32r); @@ -2848,9 +2980,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg, isKill, isUndef, ImplicitOp)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | + getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); @@ -2860,17 +2993,19 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } case X86::DEC16r: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; - assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), -1); + assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!"); + NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addOperand(Src), + -1); break; case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD32rr: case X86::ADD32rr_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc; if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) Opc = X86::LEA64r; @@ -2884,7 +3019,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg, isKill, isUndef, ImplicitOp)) return nullptr; - const MachineOperand &Src2 = MI->getOperand(2); + const MachineOperand &Src2 = MI.getOperand(2); bool isKill2, isUndef2; unsigned SrcReg2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); @@ -2892,8 +3027,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg2, isKill2, isUndef2, ImplicitOp2)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest); if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); if (ImplicitOp2.getReg() != 0) @@ -2906,45 +3041,46 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, NewMI->getOperand(3).setIsUndef(isUndef2); if (LV && Src2.isKill()) - LV->replaceKillInstruction(SrcReg2, MI, NewMI); + LV->replaceKillInstruction(SrcReg2, MI, *NewMI); break; } case X86::ADD16rr: case X86::ADD16rr_DB: { if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - unsigned Src2 = MI->getOperand(2).getReg(); - bool isKill2 = MI->getOperand(2).isKill(); - NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest), - Src.getReg(), Src.isKill(), Src2, isKill2); + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + unsigned Src2 = MI.getOperand(2).getReg(); + bool isKill2 = MI.getOperand(2).isKill(); + NewMI = addRegReg( + BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest), + Src.getReg(), Src.isKill(), Src2, isKill2); // Preserve undefness of the operands. 
- bool isUndef = MI->getOperand(1).isUndef(); - bool isUndef2 = MI->getOperand(2).isUndef(); + bool isUndef = MI.getOperand(1).isUndef(); + bool isUndef2 = MI.getOperand(2).isUndef(); NewMI->getOperand(1).setIsUndef(isUndef); NewMI->getOperand(3).setIsUndef(isUndef2); if (LV && isKill2) - LV->replaceKillInstruction(Src2, MI, NewMI); + LV->replaceKillInstruction(Src2, MI, *NewMI); break; } case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD64ri32_DB: case X86::ADD64ri8_DB: - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)) + .addOperand(Dest) + .addOperand(Src), + MI.getOperand(2).getImm()); break; case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32ri_DB: case X86::ADD32ri8_DB: { - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r; bool isKill, isUndef; @@ -2954,13 +3090,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, SrcReg, isKill, isUndef, ImplicitOp)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc)) - .addOperand(Dest) - .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill)); + MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .addOperand(Dest) + .addReg(SrcReg, getUndefRegState(isUndef) | + getKillRegState(isKill)); if (ImplicitOp.getReg() != 0) MIB.addOperand(ImplicitOp); - NewMI = addOffset(MIB, MI->getOperand(2).getImm()); + NewMI = addOffset(MIB, MI.getOperand(2).getImm()); break; } case X86::ADD16ri: @@ -2968,12 +3105,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, case X86::ADD16ri_DB: case X86::ADD16ri8_DB: if (DisableLEA16) - return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) + return is64Bit ? 
convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV) : nullptr; - assert(MI->getNumOperands() >= 3 && "Unknown add instruction!"); - NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r)) - .addOperand(Dest).addOperand(Src), - MI->getOperand(2).getImm()); + assert(MI.getNumOperands() >= 3 && "Unknown add instruction!"); + NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)) + .addOperand(Dest) + .addOperand(Src), + MI.getOperand(2).getImm()); break; } @@ -2981,12 +3119,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, if (LV) { // Update live variables if (Src.isKill()) - LV->replaceKillInstruction(Src.getReg(), MI, NewMI); + LV->replaceKillInstruction(Src.getReg(), MI, *NewMI); if (Dest.isDead()) - LV->replaceKillInstruction(Dest.getReg(), MI, NewMI); + LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI); } - MFI->insert(MBBI, NewMI); // Insert the new inst + MFI->insert(MI.getIterator(), NewMI); // Insert the new inst return NewMI; } @@ -3142,11 +3280,16 @@ static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { llvm_unreachable("Opcode not handled by the switch"); } -MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, - bool NewMI, +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const { - switch (MI->getOpcode()) { + auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & { + if (NewMI) + return *MI.getParent()->getParent()->CloneMachineInstr(&MI); + return MI; + }; + + switch (MI.getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) @@ -3155,7 +3298,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I) unsigned Opc; unsigned Size; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break; case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break; @@ -3164,15 +3307,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break; case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break; } - unsigned Amt = MI->getOperand(3).getImm(); - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->setDesc(get(Opc)); - MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + unsigned Amt = MI.getOperand(3).getImm(); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + WorkingMI.getOperand(3).setImm(Size - Amt); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -3186,7 +3326,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::VPBLENDDYrri: case X86::VPBLENDWYrri:{ unsigned Mask; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::BLENDPDrri: Mask = 0x03; break; case X86::BLENDPSrri: Mask = 0x0F; break; @@ -3201,29 +3341,23 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::VPBLENDWYrri: Mask = 0xFF; break; } // Only the 
least significant bits of Imm are used. - unsigned Imm = MI->getOperand(3).getImm() & Mask; - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + unsigned Imm = MI.getOperand(3).getImm() & Mask; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm(Mask ^ Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] - unsigned Imm = MI->getOperand(3).getImm(); + unsigned Imm = MI.getOperand(3).getImm(); unsigned Src1Hi = Imm & 0x01; unsigned Src2Hi = Imm & 0x10; - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3233,17 +3367,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::VCMPPSYrri: { // Float comparison can be safely commuted for // Ordered/Unordered/Equal/NotEqual tests - unsigned Imm = MI->getOperand(3).getImm() & 0x7; + unsigned Imm = MI.getOperand(3).getImm() & 0x7; switch (Imm) { case 0x00: // EQUAL case 0x03: // UNORDERED case 0x04: // NOT EQUAL case 0x07: // ORDERED - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; @@ -3254,7 +3383,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::VPCOMQri: case X86::VPCOMUQri: case X86::VPCOMWri: case X86::VPCOMUWri: { // Flip comparison mode immediate (if necessary). - unsigned Imm = MI->getOperand(3).getImm() & 0x7; + unsigned Imm = MI.getOperand(3).getImm() & 0x7; switch (Imm) { case 0x00: Imm = 0x02; break; // LT -> GT case 0x01: Imm = 0x03; break; // LE -> GE @@ -3267,13 +3396,21 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, default: break; } - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); + } + case X86::VPERM2F128rr: + case X86::VPERM2I128rr: { + // Flip permute source immediate. + // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. + // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. 
+ unsigned Imm = MI.getOperand(3).getImm() & 0xFF; + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.getOperand(3).setImm(Imm ^ 0x22); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3292,7 +3429,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: { unsigned Opc; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break; case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break; @@ -3343,31 +3480,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break; case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break; } - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->setDesc(get(Opc)); - // Fallthrough intended. + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } default: - if (isFMA3(MI->getOpcode())) { + if (isFMA3(MI.getOpcode())) { unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); if (Opc == 0) return nullptr; - if (NewMI) { - MachineFunction &MF = *MI->getParent()->getParent(); - MI = MF.CloneMachineInstr(MI); - NewMI = false; - } - MI->setDesc(get(Opc)); + auto &WorkingMI = cloneIfNew(MI); + WorkingMI.setDesc(get(Opc)); + return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, + OpIdx1, OpIdx2); } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } } -bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { @@ -3402,12 +3535,12 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, // CommutableOpIdx2 is well defined now. Let's choose another commutable // operand and assign its index to CommutableOpIdx1. - unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg(); for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { // The commuted operands must have different registers. // Otherwise, the commute transformation does not change anything and // is useless then. - if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg()) break; } @@ -3427,14 +3560,13 @@ bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; } -unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, - unsigned SrcOpIdx1, - unsigned SrcOpIdx2) const { - unsigned Opc = MI->getOpcode(); +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands( + MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2) const { + unsigned Opc = MI.getOpcode(); // Define the array that holds FMA opcodes in groups // of 3 opcodes(132, 213, 231) in each group. 
- static const unsigned RegularOpcodeGroups[][3] = { + static const uint16_t RegularOpcodeGroups[][3] = { { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, @@ -3508,7 +3640,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, // Define the array that holds FMA*_Int opcodes in groups // of 3 opcodes(132, 213, 231) in each group. - static const unsigned IntrinOpcodeGroups[][3] = { + static const uint16_t IntrinOpcodeGroups[][3] = { { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, @@ -3539,7 +3671,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, isFMA3(Opc, &IsIntrinOpcode); size_t GroupsNum; - const unsigned (*OpcodeGroups)[3]; + const uint16_t (*OpcodeGroups)[3]; if (IsIntrinOpcode) { GroupsNum = array_lengthof(IntrinOpcodeGroups); OpcodeGroups = IntrinOpcodeGroups; @@ -3548,7 +3680,7 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, OpcodeGroups = RegularOpcodeGroups; } - const unsigned *FoundOpcodesGroup = nullptr; + const uint16_t *FoundOpcodesGroup = nullptr; size_t FormIndex; // Look for the input opcode in the corresponding opcodes table. @@ -3616,34 +3748,33 @@ unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, return FoundOpcodesGroup[FormIndex]; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { - switch (MI->getOpcode()) { - case X86::CMPPDrri: - case X86::CMPPSrri: - case X86::VCMPPDrri: - case X86::VCMPPSrri: - case X86::VCMPPDYrri: - case X86::VCMPPSYrri: { - // Float comparison can be safely commuted for - // Ordered/Unordered/Equal/NotEqual tests - unsigned Imm = MI->getOperand(3).getImm() & 0x7; - switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - // The indices of the commutable operands are 1 and 2. - // Assign them to the returned operand indices here. - return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); - } - return false; + switch (MI.getOpcode()) { + case X86::CMPPDrri: + case X86::CMPPSrri: + case X86::VCMPPDrri: + case X86::VCMPPSrri: + case X86::VCMPPDYrri: + case X86::VCMPPSYrri: { + // Float comparison can be safely commuted for + // Ordered/Unordered/Equal/NotEqual tests + unsigned Imm = MI.getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } - default: - if (isFMA3(MI->getOpcode())) - return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return false; + } + default: + if (isFMA3(MI.getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } return false; } @@ -3791,6 +3922,8 @@ X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) { case X86::COND_NP: return X86::COND_P; case X86::COND_O: return X86::COND_NO; case X86::COND_NO: return X86::COND_O; + case X86::COND_NE_OR_P: return X86::COND_E_AND_NP; + case X86::COND_E_AND_NP: return X86::COND_NE_OR_P; } } @@ -3887,17 +4020,38 @@ unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, } } -bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { - if (!MI->isTerminator()) return false; +bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const { + if (!MI.isTerminator()) return false; // Conditional branch is a special case. - if (MI->isBranch() && !MI->isBarrier()) + if (MI.isBranch() && !MI.isBarrier()) return true; - if (!MI->isPredicable()) + if (!MI.isPredicable()) return true; return !isPredicated(MI); } +// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may +// not be a fallthrough MBB now due to layout changes). Return nullptr if the +// fallthrough MBB cannot be identified. +static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB, + MachineBasicBlock *TBB) { + // Look for non-EHPad successors other than TBB. If we find exactly one, it + // is the fallthrough MBB. If we find zero, then TBB is both the target MBB + // and fallthrough MBB. If we find more than one, we cannot identify the + // fallthrough MBB and should return nullptr. + MachineBasicBlock *FallthroughBB = nullptr; + for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) { + if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB)) + continue; + // Return a nullptr if we found more than one fallthrough successor. + if (FallthroughBB && FallthroughBB != TBB) + return nullptr; + FallthroughBB = *SI; + } + return FallthroughBB; +} + bool X86InstrInfo::AnalyzeBranchImpl( MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -3914,7 +4068,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( // Working from the bottom, when we see a non-terminator instruction, we're // done. - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) break; // A terminator that isn't a branch can't easily be handled by this @@ -4000,7 +4154,7 @@ bool X86InstrInfo::AnalyzeBranchImpl( FBB = TBB; TBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(BranchCode)); - CondBranches.push_back(I); + CondBranches.push_back(&*I); continue; } @@ -4010,41 +4164,56 @@ bool X86InstrInfo::AnalyzeBranchImpl( assert(Cond.size() == 1); assert(TBB); - // Only handle the case where all conditional branches branch to the same - // destination. - if (TBB != I->getOperand(0).getMBB()) - return true; - // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); - if (OldBranchCode == BranchCode) + auto NewTBB = I->getOperand(0).getMBB(); + if (OldBranchCode == BranchCode && TBB == NewTBB) continue; // If they differ, see if they fit one of the known patterns. 
Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. - if ((OldBranchCode == X86::COND_NP && - BranchCode == X86::COND_E) || - (OldBranchCode == X86::COND_E && - BranchCode == X86::COND_NP)) - BranchCode = X86::COND_NP_OR_E; - else if ((OldBranchCode == X86::COND_P && - BranchCode == X86::COND_NE) || - (OldBranchCode == X86::COND_NE && - BranchCode == X86::COND_P)) + if (TBB == NewTBB && + ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) { BranchCode = X86::COND_NE_OR_P; - else + } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) || + (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) { + if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB))) + return true; + + // X86::COND_E_AND_NP usually has two different branch destinations. + // + // JP B1 + // JE B2 + // JMP B1 + // B1: + // B2: + // + // Here this condition branches to B2 only if NP && E. It has another + // equivalent form: + // + // JNE B1 + // JNP B2 + // JMP B1 + // B1: + // B2: + // + // Similarly it branches to B2 only if E && NP. That is why this condition + // is named with COND_E_AND_NP. + BranchCode = X86::COND_E_AND_NP; + } else return true; // Update the MachineOperand. Cond[0].setImm(BranchCode); - CondBranches.push_back(I); + CondBranches.push_back(&*I); } return false; } -bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, +bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, @@ -4053,7 +4222,7 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify); } -bool X86InstrInfo::AnalyzeBranchPredicate(MachineBasicBlock &MBB, +bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB, MachineBranchPredicate &MBP, bool AllowModify) const { using namespace std::placeholders; @@ -4142,10 +4311,11 @@ unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -unsigned -X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, - MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - DebugLoc DL) const { +unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && @@ -4158,17 +4328,13 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, return 1; } + // If FBB is null, it is implied to be a fall-through block. + bool FallThru = FBB == nullptr; + // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); switch (CC) { - case X86::COND_NP_OR_E: - // Synthesize NP_OR_E with two branches. - BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); - ++Count; - BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); - ++Count; - break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); @@ -4176,13 +4342,26 @@ X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; + case X86::COND_E_AND_NP: + // Use the next block of MBB as FBB if it is null. 
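The two branch sequences described in the COND_E_AND_NP comment above can be checked mechanically. The following standalone sketch (illustrative only, not part of the patch) enumerates the four ZF/PF combinations and confirms that "JP B1; JE B2; JMP B1" and "JNE B1; JNP B2; JMP B1" reach B2 for exactly the same flag values, namely E && NP:

// Not LLVM code; a throwaway check of the branch-pattern equivalence above.
#include <cassert>

enum Block { B1, B2 };

// "JP B1; JE B2; JMP B1"
static Block patternA(bool ZF, bool PF) {
  if (PF) return B1;   // JP B1
  if (ZF) return B2;   // JE B2
  return B1;           // JMP B1
}

// "JNE B1; JNP B2; JMP B1"
static Block patternB(bool ZF, bool PF) {
  if (!ZF) return B1;  // JNE B1
  if (!PF) return B2;  // JNP B2
  return B1;           // JMP B1
}

int main() {
  for (int ZF = 0; ZF <= 1; ++ZF)
    for (int PF = 0; PF <= 1; ++PF) {
      assert(patternA(ZF, PF) == patternB(ZF, PF));
      assert((patternA(ZF, PF) == B2) == (ZF && !PF)); // COND_E_AND_NP
    }
  return 0;
}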
+ if (FBB == nullptr) { + FBB = getFallThroughMBB(&MBB, TBB); + assert(FBB && "MBB cannot be the last block in function when the false " + "body is a fall-through."); + } + // Synthesize COND_E_AND_NP with two branches. + BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB); + ++Count; + BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); + ++Count; + break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } - if (FBB) { + if (!FallThru) { // Two-way Conditional branch. Insert the second branch. BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; @@ -4228,15 +4407,16 @@ canInsertSelect(const MachineBasicBlock &MBB, } void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const { - MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - assert(Cond.size() == 1 && "Invalid Cond array"); - unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), - MRI.getRegClass(DstReg)->getSize(), - false/*HasMemoryOperand*/); - BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, + unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(Cond.size() == 1 && "Invalid Cond array"); + unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), + MRI.getRegClass(DstReg)->getSize(), + false /*HasMemoryOperand*/); + BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } /// Test if the given register is a physical h register. @@ -4258,16 +4438,18 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. - return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : - X86::MOVPQIto64rr); + return HasAVX512 ? X86::VMOVPQIto64Zrr : + HasAVX ? X86::VMOVPQIto64rr : + X86::MOVPQIto64rr; if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. return X86::MMX_MOVD64from64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. if (X86::VR128XRegClass.contains(DestReg)) - return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : - X86::MOV64toPQIrr); + return HasAVX512 ? X86::VMOV64toPQIZrr : + HasAVX ? X86::VMOV64toPQIrr : + X86::MOV64toPQIrr; // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) return X86::MMX_MOVD64to64rr; @@ -4276,22 +4458,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, // SrcReg(FR32) -> DestReg(GR32) // SrcReg(GR32) -> DestReg(FR32) - if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) + if (X86::GR32RegClass.contains(DestReg) && + X86::FR32XRegClass.contains(SrcReg)) // Copy from a FR32 register to a GR32 register. - return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr); + return HasAVX512 ? X86::VMOVSS2DIZrr : + HasAVX ? X86::VMOVSS2DIrr : + X86::MOVSS2DIrr; - if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + if (X86::FR32XRegClass.contains(DestReg) && + X86::GR32RegClass.contains(SrcReg)) // Copy from a GR32 register to a FR32 register. - return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? 
X86::VMOVDI2SSrr : X86::MOVDI2SSrr); + return HasAVX512 ? X86::VMOVDI2SSZrr : + HasAVX ? X86::VMOVDI2SSrr : + X86::MOVDI2SSrr; return 0; } +static bool isMaskRegClass(const TargetRegisterClass *RC) { + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + return X86::VK16RegClass.hasSubClassEq(RC); +} + static bool MaskRegClassContains(unsigned Reg) { - return X86::VK8RegClass.contains(Reg) || - X86::VK16RegClass.contains(Reg) || - X86::VK32RegClass.contains(Reg) || - X86::VK64RegClass.contains(Reg) || - X86::VK1RegClass.contains(Reg); + // All KMASK RegClasses hold the same k registers, can be tested against anyone. + return X86::VK16RegClass.contains(Reg); } static bool GRRegClassContains(unsigned Reg) { @@ -4338,13 +4528,22 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, if (Subtarget.hasBWI()) if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) return Opc; - if (X86::VR128XRegClass.contains(DestReg, SrcReg) || - X86::VR256XRegClass.contains(DestReg, SrcReg) || - X86::VR512RegClass.contains(DestReg, SrcReg)) { - DestReg = get512BitSuperRegister(DestReg); - SrcReg = get512BitSuperRegister(SrcReg); + if (X86::VR128XRegClass.contains(DestReg, SrcReg)) { + if (Subtarget.hasVLX()) + return X86::VMOVAPSZ128rr; + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if (X86::VR256XRegClass.contains(DestReg, SrcReg)) { + if (Subtarget.hasVLX()) + return X86::VMOVAPSZ256rr; + DestReg = get512BitSuperRegister(DestReg); + SrcReg = get512BitSuperRegister(SrcReg); + return X86::VMOVAPSZrr; + } + if (X86::VR512RegClass.contains(DestReg, SrcReg)) return X86::VMOVAPSZrr; - } if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { @@ -4359,9 +4558,9 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, } void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -4455,22 +4654,33 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // first frame index. // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. - - bool AXDead = (Reg == AX) || - (MachineBasicBlock::LQR_Dead == - MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); - if (!AXDead) { - // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may - // actually be dead. This is not a problem for correctness as we are just - // (unnecessarily) saving+restoring a dead register. However the - // MachineVerifier expects operands that read from dead registers - // to be marked with the "undef" flag. - // An example of this can be found in - // test/CodeGen/X86/peephole-na-phys-copy-folding.ll and - // test/CodeGen/X86/cmpxchg-clobber-flags.ll when using - // -verify-machineinstrs. - BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + MachineBasicBlock::LivenessQueryResult LQR = + MBB.computeRegisterLiveness(TRI, AX, MI); + // We do not want to save and restore AX if we do not have to. 
+ // Moreover, if we do so whereas AX is dead, we would need to set + // an undef flag on the use of AX, otherwise the verifier will + // complain that we read an undef value. + // We do not want to change the behavior of the machine verifier + // as this is usually wrong to read an undef value. + if (MachineBasicBlock::LQR_Unknown == LQR) { + LivePhysRegs LPR(TRI); + LPR.addLiveOuts(MBB); + MachineBasicBlock::iterator I = MBB.end(); + while (I != MI) { + --I; + LPR.stepBackward(*I); + } + // AX contains the top most register in the aliasing hierarchy. + // It may not be live, but one of its aliases may be. + for (MCRegAliasIterator AI(AX, TRI, true); + AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) + LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live + : MachineBasicBlock::LQR_Dead; } + bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR); + if (!AXDead) + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); if (FromEFLAGS) { BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); BuildMI(MBB, MI, DL, get(X86::LAHF)); @@ -4493,15 +4703,28 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Cannot emit physreg copy instruction"); } +static unsigned getLoadStoreMaskRegOpcode(const TargetRegisterClass *RC, + bool load) { + switch (RC->getSize()) { + default: + llvm_unreachable("Unknown spill size"); + case 2: + return load ? X86::KMOVWkm : X86::KMOVWmk; + case 4: + return load ? X86::KMOVDkm : X86::KMOVDmk; + case 8: + return load ? X86::KMOVQkm : X86::KMOVQmk; + } +} + static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI, bool load) { if (STI.hasAVX512()) { - if (X86::VK8RegClass.hasSubClassEq(RC) || - X86::VK16RegClass.hasSubClassEq(RC)) - return load ? X86::KMOVWkm : X86::KMOVWmk; + if (isMaskRegClass(RC)) + return getLoadStoreMaskRegOpcode(RC, load); if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) @@ -4554,25 +4777,38 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, assert((X86::VR128RegClass.hasSubClassEq(RC) || X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. + if (X86::VR128RegClass.hasSubClassEq(RC)) { + if (isStackAligned) + return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) + : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + else + return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) + : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } + assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? - (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : - (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); + return load ? X86::VMOVAPSZ128rm : X86::VMOVAPSZ128mr; else - return load ? - (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : - (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + return load ? X86::VMOVUPSZ128rm : X86::VMOVUPSZ128mr; } case 32: assert((X86::VR256RegClass.hasSubClassEq(RC) || X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. + if (X86::VR256RegClass.hasSubClassEq(RC)) { + if (isStackAligned) + return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; + else + return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + } + assert(STI.hasVLX() && "Using extended register requires VLX"); if (isStackAligned) - return load ? 
X86::VMOVAPSYrm : X86::VMOVAPSYmr; + return load ? X86::VMOVAPSZ256rm : X86::VMOVAPSZ256mr; else - return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; + return load ? X86::VMOVUPSZ256rm : X86::VMOVUPSZ256mr; case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); + assert(STI.hasVLX() && "Using 512-bit register requires AVX512"); if (isStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else @@ -4580,25 +4816,29 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg, } } -bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg, - unsigned &Offset, +bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const { - const MCInstrDesc &Desc = MemOp->getDesc(); - int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags, MemOp->getOpcode()); + const MCInstrDesc &Desc = MemOp.getDesc(); + int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags); if (MemRefBegin < 0) return false; MemRefBegin += X86II::getOperandBias(Desc); - BaseReg = MemOp->getOperand(MemRefBegin + X86::AddrBaseReg).getReg(); - if (MemOp->getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) + MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg); + if (!BaseMO.isReg()) // Can be an MO_FrameIndex + return false; + + BaseReg = BaseMO.getReg(); + if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1) return false; - if (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != + if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() != X86::NoRegister) return false; - const MachineOperand &DispMO = MemOp->getOperand(MemRefBegin + X86::AddrDisp); + const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp); // Displacement can be symbolic if (!DispMO.isImm()) @@ -4606,8 +4846,8 @@ bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *MemOp, unsigned &BaseReg, Offset = DispMO.getImm(); - return (MemOp->getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == - X86::NoRegister); + return MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() == + X86::NoRegister; } static unsigned getStoreRegOpcode(unsigned SrcReg, @@ -4697,10 +4937,10 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, NewMIs.push_back(MIB); } -bool X86InstrInfo:: -analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, - int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { +bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, + unsigned &SrcReg2, int &CmpMask, + int &CmpValue) const { + switch (MI.getOpcode()) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: @@ -4709,17 +4949,17 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: - SrcReg = MI->getOperand(0).getReg(); + SrcReg = MI.getOperand(0).getReg(); SrcReg2 = 0; CmpMask = ~0; - CmpValue = MI->getOperand(1).getImm(); + CmpValue = MI.getOperand(1).getImm(); return true; // A SUB can be used to perform comparison. 
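As background for the "A SUB can be used to perform comparison" cases above: on x86, cmp a, b performs the same subtraction as sub a, b and sets identical flags, it merely discards the result. That is why a SUB whose destination has no uses can be rewritten as a CMP, and why a CMP repeating an earlier SUB of the same operands is redundant. The small hand-written model below (not LLVM code) spells out the 32-bit flag computation and checks a few values:

#include <cassert>
#include <cstdint>

struct Flags { bool CF, ZF, SF, OF; };

// Flags produced by a 32-bit "sub a, b"; "cmp a, b" computes the same thing.
static Flags subFlags(uint32_t A, uint32_t B) {
  uint32_t R = A - B;
  return {A < B,                                   // CF: unsigned borrow
          R == 0,                                  // ZF
          ((R >> 31) & 1) != 0,                    // SF
          ((((A ^ B) & (A ^ R)) >> 31) & 1) != 0}; // OF: signed overflow
}

int main() {
  Flags F = subFlags(5, 7);       // 5 - 7 borrows and is negative
  assert(F.CF && F.SF && !F.ZF && !F.OF);
  F = subFlags(7, 7);             // equal operands set ZF
  assert(F.ZF && !F.CF);
  F = subFlags(0x80000000u, 1);   // INT_MIN - 1 overflows signed
  assert(F.OF && !F.CF);
  return 0;
}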
case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; CmpValue = 0; @@ -4728,8 +4968,8 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -4740,17 +4980,17 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; - CmpValue = MI->getOperand(2).getImm(); + CmpValue = MI.getOperand(2).getImm(); return true; case X86::CMP64rr: case X86::CMP32rr: case X86::CMP16rr: case X86::CMP8rr: - SrcReg = MI->getOperand(0).getReg(); - SrcReg2 = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(0).getReg(); + SrcReg2 = MI.getOperand(1).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -4758,8 +4998,9 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: - SrcReg = MI->getOperand(0).getReg(); - if (MI->getOperand(1).getReg() != SrcReg) return false; + SrcReg = MI.getOperand(0).getReg(); + if (MI.getOperand(1).getReg() != SrcReg) + return false; // Compare against zero. SrcReg2 = 0; CmpMask = ~0; @@ -4775,47 +5016,40 @@ analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, /// This function can be extended later on. /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. 
-inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, +inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg, unsigned SrcReg2, int ImmValue, - MachineInstr *OI) { - if (((FlagI->getOpcode() == X86::CMP64rr && - OI->getOpcode() == X86::SUB64rr) || - (FlagI->getOpcode() == X86::CMP32rr && - OI->getOpcode() == X86::SUB32rr)|| - (FlagI->getOpcode() == X86::CMP16rr && - OI->getOpcode() == X86::SUB16rr)|| - (FlagI->getOpcode() == X86::CMP8rr && - OI->getOpcode() == X86::SUB8rr)) && - ((OI->getOperand(1).getReg() == SrcReg && - OI->getOperand(2).getReg() == SrcReg2) || - (OI->getOperand(1).getReg() == SrcReg2 && - OI->getOperand(2).getReg() == SrcReg))) + MachineInstr &OI) { + if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) || + (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) || + (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) || + (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) && + ((OI.getOperand(1).getReg() == SrcReg && + OI.getOperand(2).getReg() == SrcReg2) || + (OI.getOperand(1).getReg() == SrcReg2 && + OI.getOperand(2).getReg() == SrcReg))) return true; - if (((FlagI->getOpcode() == X86::CMP64ri32 && - OI->getOpcode() == X86::SUB64ri32) || - (FlagI->getOpcode() == X86::CMP64ri8 && - OI->getOpcode() == X86::SUB64ri8) || - (FlagI->getOpcode() == X86::CMP32ri && - OI->getOpcode() == X86::SUB32ri) || - (FlagI->getOpcode() == X86::CMP32ri8 && - OI->getOpcode() == X86::SUB32ri8) || - (FlagI->getOpcode() == X86::CMP16ri && - OI->getOpcode() == X86::SUB16ri) || - (FlagI->getOpcode() == X86::CMP16ri8 && - OI->getOpcode() == X86::SUB16ri8) || - (FlagI->getOpcode() == X86::CMP8ri && - OI->getOpcode() == X86::SUB8ri)) && - OI->getOperand(1).getReg() == SrcReg && - OI->getOperand(2).getImm() == ImmValue) + if (((FlagI.getOpcode() == X86::CMP64ri32 && + OI.getOpcode() == X86::SUB64ri32) || + (FlagI.getOpcode() == X86::CMP64ri8 && + OI.getOpcode() == X86::SUB64ri8) || + (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) || + (FlagI.getOpcode() == X86::CMP32ri8 && + OI.getOpcode() == X86::SUB32ri8) || + (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) || + (FlagI.getOpcode() == X86::CMP16ri8 && + OI.getOpcode() == X86::SUB16ri8) || + (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) && + OI.getOperand(1).getReg() == SrcReg && + OI.getOperand(2).getImm() == ImmValue) return true; return false; } /// Check whether the definition can be converted /// to remove a comparison against zero. -inline static bool isDefConvertible(MachineInstr *MI) { - switch (MI->getOpcode()) { +inline static bool isDefConvertible(MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; // The shift instructions only modify ZF if their shift count is non-zero. @@ -4899,8 +5133,8 @@ inline static bool isDefConvertible(MachineInstr *MI) { } /// Check whether the use can be converted to remove a comparison against zero. 
-static X86::CondCode isUseDefConvertible(MachineInstr *MI) { - switch (MI->getOpcode()) { +static X86::CondCode isUseDefConvertible(MachineInstr &MI) { + switch (MI.getOpcode()) { default: return X86::COND_INVALID; case X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: case X86::LZCNT32rm: @@ -4920,13 +5154,13 @@ static X86::CondCode isUseDefConvertible(MachineInstr *MI) { /// Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. -bool X86InstrInfo:: -optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, - int CmpMask, int CmpValue, - const MachineRegisterInfo *MRI) const { +bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, + unsigned SrcReg2, int CmpMask, + int CmpValue, + const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. unsigned NewOpcode = 0; - switch (CmpInstr->getOpcode()) { + switch (CmpInstr.getOpcode()) { default: break; case X86::SUB64ri32: case X86::SUB64ri8: @@ -4943,10 +5177,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: { - if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. - switch (CmpInstr->getOpcode()) { + switch (CmpInstr.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; @@ -4964,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } - CmpInstr->setDesc(get(NewOpcode)); - CmpInstr->RemoveOperand(0); + CmpInstr.setDesc(get(NewOpcode)); + CmpInstr.RemoveOperand(0); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) @@ -4983,7 +5217,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); - if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) + if (IsCmpZero && MI->getParent() != CmpInstr.getParent()) return false; // If we have a use of the source register between the def and our compare @@ -4991,19 +5225,20 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // right way. bool ShouldUpdateCC = false; X86::CondCode NewCC = X86::COND_INVALID; - if (IsCmpZero && !isDefConvertible(MI)) { + if (IsCmpZero && !isDefConvertible(*MI)) { // Scan forward from the use until we hit the use we're looking for or the // compare instruction. for (MachineBasicBlock::iterator J = MI;; ++J) { // Do we have a convertible instruction? - NewCC = isUseDefConvertible(J); + NewCC = isUseDefConvertible(*J); if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && J->getOperand(1).getReg() == SrcReg) { assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); ShouldUpdateCC = true; // Update CC later on. // This is not a def of SrcReg, but still a def of EFLAGS. Keep going // with the new def. 
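The cmp-against-zero removal above hinges on the fact that such a compare only re-derives ZF and SF from a value the defining instruction has already produced. A minimal standalone illustration (not LLVM code; CF and OF behave differently, which is why only certain condition codes are allowed to survive the rewrite):

#include <cassert>
#include <cstdint>

struct ZS { bool ZF, SF; };

static ZS flagsFromResult(uint32_t R) {
  return {R == 0, ((R >> 31) & 1) != 0};
}

// What a (redundant) "cmp R, 0" would compute: R - 0 is just R again.
static ZS cmpAgainstZero(uint32_t R) { return flagsFromResult(R - 0); }

int main() {
  const uint32_t Vals[] = {0u, 1u, 0x80000000u, 0xffffffffu};
  for (uint32_t V : Vals) {
    ZS FromDef = flagsFromResult(V); // e.g. what AND/SUB/XOR already set
    ZS FromCmp = cmpAgainstZero(V);  // what the compare would set again
    assert(FromDef.ZF == FromCmp.ZF && FromDef.SF == FromCmp.SF);
  }
  return 0;
}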
- MI = Def = J; + Def = J; + MI = &*Def; break; } @@ -5024,29 +5259,29 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // otherwise, RE is the rend of the basic block. MachineBasicBlock::reverse_iterator RI = MachineBasicBlock::reverse_iterator(I), - RE = CmpInstr->getParent() == MI->getParent() ? - MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : - CmpInstr->getParent()->rend(); + RE = CmpInstr.getParent() == MI->getParent() + ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ + : CmpInstr.getParent()->rend(); MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { - MachineInstr *Instr = &*RI; + MachineInstr &Instr = *RI; // Check whether CmpInstr can be made redundant by the current instruction. if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { - Sub = Instr; + Sub = &Instr; break; } - if (Instr->modifiesRegister(X86::EFLAGS, TRI) || - Instr->readsRegister(X86::EFLAGS, TRI)) { + if (Instr.modifiesRegister(X86::EFLAGS, TRI) || + Instr.readsRegister(X86::EFLAGS, TRI)) { // This instruction modifies or uses EFLAGS. // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. - if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && - Instr->registerDefIsDead(X86::EFLAGS, TRI)) { - Movr0Inst = Instr; + if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 && + Instr.registerDefIsDead(X86::EFLAGS, TRI)) { + Movr0Inst = &Instr; continue; } @@ -5068,7 +5303,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. bool IsSafe = false; SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate; - MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); + MachineBasicBlock::iterator E = CmpInstr.getParent()->end(); for (++I; I != E; ++I) { const MachineInstr &Instr = *I; bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); @@ -5159,7 +5394,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // If EFLAGS is not killed nor re-defined, we should check whether it is // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { - MachineBasicBlock *MBB = CmpInstr->getParent(); + MachineBasicBlock *MBB = CmpInstr.getParent(); for (MachineBasicBlock *Successor : MBB->successors()) if (Successor->isLiveIn(X86::EFLAGS)) return false; @@ -5199,7 +5434,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, } assert(i != e && "Unable to locate a def EFLAGS operand"); - CmpInstr->eraseFromParent(); + CmpInstr.eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. for (auto &Op : OpsToUpdate) @@ -5211,14 +5446,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. -MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, +MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) return nullptr; // To be conservative, if there exists another load, clear the load candidate. 
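For the optimizeLoadInstr conditions quoted above (the load defines a virtual register that is used exactly once, in the same block, with no memory traffic or side effects in between), here is a simplified standalone checker over an ad-hoc instruction record; the names are illustrative and do not mirror the real MachineInstr API:

#include <cassert>
#include <vector>

struct Inst {
  bool MayLoad, MayStore, SideEffects;
  int Def;               // register defined, -1 if none
  std::vector<int> Uses; // registers read
};

static bool canFoldLoadIntoSingleUse(const std::vector<Inst> &Block,
                                     unsigned LoadIdx, unsigned UseIdx) {
  int Reg = Block[LoadIdx].Def;
  if (Reg < 0 || !Block[LoadIdx].MayLoad)
    return false;
  // The loaded register must be used exactly once, and only at UseIdx.
  unsigned NumUses = 0;
  for (unsigned I = 0; I < Block.size(); ++I)
    for (int U : Block[I].Uses)
      if (U == Reg) {
        ++NumUses;
        if (I != UseIdx)
          return false;
      }
  if (NumUses != 1)
    return false;
  // Nothing between the load and its use may touch memory or have side
  // effects, otherwise moving the memory access down to the use is unsafe.
  for (unsigned I = LoadIdx + 1; I < UseIdx; ++I)
    if (Block[I].MayLoad || Block[I].MayStore || Block[I].SideEffects)
      return false;
  return true;
}

int main() {
  std::vector<Inst> B;
  B.push_back({true, false, false, 1, {}});   // %1 = load
  B.push_back({false, false, false, 2, {}});  // %2 = ... (no memory access)
  B.push_back({false, false, false, 3, {1}}); // %3 = op %1  (single use)
  assert(canFoldLoadIntoSingleUse(B, 0, 2));
  B[1].MayStore = true;                       // a store in between blocks it
  assert(!canFoldLoadIntoSingleUse(B, 0, 2));
  return 0;
}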
- if (MI->mayLoad()) { + if (MI.mayLoad()) { FoldAsLoadDefReg = 0; return nullptr; } @@ -5233,8 +5468,8 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, // Collect information about virtual register operands of MI. unsigned SrcOperandId = 0; bool FoundSrcOperand = false; - for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); @@ -5251,7 +5486,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, *DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -5313,6 +5548,60 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, return true; } +bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + int64_t Imm = MIB->getOperand(1).getImm(); + assert(Imm != 0 && "Using push/pop for 0 is not efficient."); + MachineBasicBlock::iterator I = MIB.getInstr(); + + int StackAdjustment; + + if (Subtarget.is64Bit()) { + assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || + MIB->getOpcode() == X86::MOV32ImmSExti8); + + // Can't use push/pop lowering if the function might write to the red zone. + X86MachineFunctionInfo *X86FI = + MBB.getParent()->getInfo<X86MachineFunctionInfo>(); + if (X86FI->getUsesRedZone()) { + MIB->setDesc(get(MIB->getOpcode() == X86::MOV32ImmSExti8 ? X86::MOV32ri + : X86::MOV64ri)); + return true; + } + + // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and + // widen the register if necessary. + StackAdjustment = 8; + BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(get(X86::POP64r)); + MIB->getOperand(0) + .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); + } else { + assert(MIB->getOpcode() == X86::MOV32ImmSExti8); + StackAdjustment = 4; + BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(get(X86::POP32r)); + } + + // Build CFI if necessary. + MachineFunction &MF = *MBB.getParent(); + const X86FrameLowering *TFL = Subtarget.getFrameLowering(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsDwarfCFI = + !IsWin64Prologue && + (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; + if (EmitCFI) { + TFL->BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); + TFL->BuildCFI(MBB, std::next(I), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); + } + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
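A note on why ExpandMOVImmSExti8 above bothers with a push/pop pair when optimizing for size: with the standard encodings, and assuming a plain register so the pop needs no REX prefix, the pair costs 3 bytes against 5 or 7 bytes for the mov forms. A throwaway sketch of the arithmetic:

// Byte counts from the usual x86 encodings; illustrative only.
#include <cstdio>

int main() {
  const unsigned MovR32Imm32 = 5; // B8+rd id
  const unsigned MovR64Imm32 = 7; // REX.W C7 /0 id (sign-extended imm32)
  const unsigned PushImm8    = 2; // 6A ib
  const unsigned PopR        = 1; // 58+rd
  std::printf("mov r32, imm32 : %u bytes\n", MovR32Imm32);
  std::printf("mov r64, imm32 : %u bytes\n", MovR64Imm32);
  std::printf("push/pop       : %u bytes\n", PushImm8 + PopR);
  // The push/pop form writes the stack, which is why the expansion above
  // falls back to a plain mov when the function may use the red zone.
  return 0;
}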
static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -5322,9 +5611,9 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, unsigned Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); - unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; + auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); + MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -5335,16 +5624,19 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } -bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { +bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { bool HasAVX = Subtarget.hasAVX(); - MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); - switch (MI->getOpcode()) { + MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI); + switch (MI.getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); case X86::MOV32r1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::MOV32ImmSExti8: + case X86::MOV64ImmSExti8: + return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -5360,17 +5652,30 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::AVX_SET0: assert(HasAVX && "AVX not supported"); return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + case X86::AVX512_128_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr)); + case X86::AVX512_256_SET0: + return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr)); case X86::AVX512_512_SET0: return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); + case X86::AVX512_512_SETALLONES: { + unsigned Reg = MIB->getOperand(0).getReg(); + MIB->setDesc(get(X86::VPTERNLOGDZrri)); + // VPTERNLOGD needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } case X86::TEST8ri_NOREX: - MI->setDesc(get(X86::TEST8ri)); + MI.setDesc(get(X86::TEST8ri)); return true; case X86::MOV32ri64: - MI->setDesc(get(X86::MOV32ri)); + MI.setDesc(get(X86::MOV32ri)); return true; // KNL does not recognize dependency-breaking idioms for mask registers, @@ -5422,23 +5727,23 @@ static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, + MachineInstr &MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. // Omit the implicit operands, something BuildMI can't do. 
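The AVX512_512_SETALLONES expansion above relies on VPTERNLOGD treating its immediate as an 8-entry truth table indexed by the three source bits, so an immediate of 0xff yields a 1 for every index and the result is all-ones no matter what the undef inputs hold. A standalone bitwise model of that behavior (illustrative, not the instruction's formal specification):

#include <cassert>
#include <cstdint>

static uint32_t ternlog32(uint32_t A, uint32_t B, uint32_t C, uint8_t Imm) {
  uint32_t R = 0;
  for (int Bit = 0; Bit < 32; ++Bit) {
    // Index the 8-bit truth table with the three source bits.
    unsigned Idx = (((A >> Bit) & 1) << 2) | (((B >> Bit) & 1) << 1) |
                   ((C >> Bit) & 1);
    R |= uint32_t((Imm >> Idx) & 1) << Bit;
  }
  return R;
}

int main() {
  // Whatever garbage the three inputs hold, imm = 0xff produces all-ones.
  const uint32_t Junk[] = {0u, 0xdeadbeefu, 0xffffffffu, 0x12345678u};
  for (uint32_t A : Junk)
    for (uint32_t B : Junk)
      for (uint32_t C : Junk)
        assert(ternlog32(A, B, C, 0xff) == 0xffffffffu);
  return 0;
}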
- MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), - MI->getDebugLoc(), true); + MachineInstr *NewMI = + MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); addOperands(MIB, MOs); // Loop over the rest of the ri operands, converting them over. - unsigned NumOps = MI->getDesc().getNumOperands()-2; + unsigned NumOps = MI.getDesc().getNumOperands() - 2; for (unsigned i = 0; i != NumOps; ++i) { - MachineOperand &MO = MI->getOperand(i+2); + MachineOperand &MO = MI.getOperand(i + 2); MIB.addOperand(MO); } - for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); MIB.addOperand(MO); } @@ -5451,15 +5756,15 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII, + MachineInstr &MI, const TargetInstrInfo &TII, int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. - MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), - MI->getDebugLoc(), true); + MachineInstr *NewMI = + MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI.getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); addOperands(MIB, MOs, PtrOffset); @@ -5477,35 +5782,35 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI) { + MachineInstr &MI) { MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt, - MI->getDebugLoc(), TII.get(Opcode)); + MI.getDebugLoc(), TII.get(Opcode)); addOperands(MIB, MOs); return MIB.addImm(0); } MachineInstr *X86InstrInfo::foldMemoryOperandCustom( - MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, unsigned Align) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: // Attempt to convert the load of inserted vector into a fold load // of a single float. if (OpNum == 2) { - unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm(); unsigned ZMask = Imm & 15; unsigned DstIdx = (Imm >> 4) & 3; unsigned SrcIdx = (Imm >> 6) & 3; - unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); if (Size <= RCSize && 4 <= Align) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = - (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm - : X86::INSERTPSrm); + (MI.getOpcode() == X86::VINSERTPSrr ? 
X86::VINSERTPSrm + : X86::INSERTPSrm); MachineInstr *NewMI = FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); @@ -5513,17 +5818,34 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( } } break; + case X86::MOVHLPSrr: + case X86::VMOVHLPSrr: + // Move the upper 64-bits of the second operand to the lower 64-bits. + // To fold the load, adjust the pointer to the upper and use (V)MOVLPS. + // TODO: In most cases AVX doesn't have a 8-byte alignment requirement. + if (OpNum == 2) { + unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 8 <= Align) { + unsigned NewOpCode = + (MI.getOpcode() == X86::VMOVHLPSrr ? X86::VMOVLPSrm + : X86::MOVLPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8); + return NewMI; + } + } + break; }; return nullptr; } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, unsigned Size, unsigned Align, bool AllowCommute) const { const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; + std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; @@ -5531,19 +5853,19 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // do not fold loads into calls or pushes, unless optimizing for size // aggressively. if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || - MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || - MI->getOpcode() == X86::PUSH64r)) + (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || + MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || + MI.getOpcode() == X86::PUSH64r)) return nullptr; - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; + unsigned NumOps = MI.getDesc().getNumOperands(); + bool isTwoAddr = + NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; // FIXME: AsmPrinter doesn't know how to handle // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. - if (MI->getOpcode() == X86::ADD32ri && - MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) + if (MI.getOpcode() == X86::ADD32ri && + MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) return nullptr; MachineInstr *NewMI = nullptr; @@ -5556,14 +5878,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. 
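The MOVHLPS fold above works because, in little-endian lane order, the upper 64 bits of a 16-byte load from p are exactly an 8-byte load from p + 8; that is what the rewrite to (V)MOVLPS with a PtrOffset of 8 depends on. A quick standalone check:

// Not LLVM code; verifies the pointer adjustment used by the fold.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char Mem[16];
  for (int I = 0; I < 16; ++I)
    Mem[I] = static_cast<unsigned char>(I * 17 + 3);

  // 16-byte load, then take the upper 64 bits (little-endian lane order).
  uint64_t Wide[2];
  std::memcpy(Wide, Mem, 16);
  uint64_t HighOfWideLoad = Wide[1];

  // 8-byte load at offset 8.
  uint64_t NarrowLoad;
  std::memcpy(&NarrowLoad, Mem + 8, 8);

  assert(HighOfWideLoad == NarrowLoad);
  return 0;
}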
- if (isTwoAddr && NumOps >= 2 && OpNum < 2 && - MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && - MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { + if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() && + MI.getOperand(1).isReg() && + MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (OpNum == 0) { - if (MI->getOpcode() == X86::MOV32r0) { + if (MI.getOpcode() == X86::MOV32r0) { NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); if (NewMI) return NewMI; @@ -5583,8 +5904,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If table selected... if (OpcodeTablePtr) { // Find the Opcode to fuse - DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = - OpcodeTablePtr->find(MI->getOpcode()); + auto I = OpcodeTablePtr->find(MI.getOpcode()); if (I != OpcodeTablePtr->end()) { unsigned Opcode = I->second.first; unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; @@ -5592,7 +5912,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; bool NarrowToMOV32rm = false; if (Size) { - unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. @@ -5601,7 +5921,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is // due to live interval analysis remat'ing a load from stack slot. - if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) + if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; @@ -5632,14 +5952,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (AllowCommute) { unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { - bool HasDef = MI->getDesc().getNumDefs(); - unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; - unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); - unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); + bool HasDef = MI.getDesc().getNumDefs(); + unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0; + unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg(); + unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg(); bool Tied1 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied2 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); + 0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. @@ -5653,7 +5973,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Unable to commute. return nullptr; } - if (CommutedMI != MI) { + if (CommutedMI != &MI) { // New instruction. We can't fold from this. CommutedMI->eraseFromParent(); return nullptr; @@ -5672,7 +5992,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Unable to commute. return nullptr; } - if (UncommutedMI != MI) { + if (UncommutedMI != &MI) { // New instruction. It doesn't need to be kept. 
UncommutedMI->eraseFromParent(); return nullptr; @@ -5684,8 +6004,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } // No fusion - if (PrintFailedFusing && !MI->isCopy()) - dbgs() << "We failed to fuse operand " << OpNum << " in " << *MI; + if (PrintFailedFusing && !MI.isCopy()) + dbgs() << "We failed to fuse operand " << OpNum << " in " << MI; return nullptr; } @@ -5723,6 +6043,10 @@ static bool hasPartialRegUpdate(unsigned Opcode) { case X86::CVTSS2SDrm: case X86::Int_CVTSS2SDrr: case X86::Int_CVTSS2SDrm: + case X86::MOVHPDrm: + case X86::MOVHPSrm: + case X86::MOVLPDrm: + case X86::MOVLPSrm: case X86::RCPSSr: case X86::RCPSSm: case X86::RCPSSr_Int: @@ -5753,27 +6077,27 @@ static bool hasPartialRegUpdate(unsigned Opcode) { /// Inform the ExeDepsFix pass how many idle /// instructions we would like before a partial register update. -unsigned X86InstrInfo:: -getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) +unsigned X86InstrInfo::getPartialRegUpdateClearance( + const MachineInstr &MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode())) return 0; // If MI is marked as reading Reg, the partial register update is wanted. - const MachineOperand &MO = MI->getOperand(0); + const MachineOperand &MO = MI.getOperand(0); unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { - if (MO.readsReg() || MI->readsVirtualRegister(Reg)) + if (MO.readsReg() || MI.readsVirtualRegister(Reg)) return 0; } else { - if (MI->readsRegister(Reg, TRI)) + if (MI.readsRegister(Reg, TRI)) return 0; } - // If any of the preceding 16 instructions are reading Reg, insert a - // dependency breaking instruction. The magic number is based on a few - // Nehalem experiments. - return 16; + // If any instructions in the clearance range are reading Reg, insert a + // dependency breaking instruction, which is inexpensive and is likely to + // be hidden in other instruction's cycles. + return PartialRegUpdateClearance; } // Return true for any instruction the copies the high bits of the first source @@ -5847,59 +6171,61 @@ static bool hasUndefRegUpdate(unsigned Opcode) { /// /// Like getPartialRegUpdateClearance, this makes a strong assumption that the /// high bits that are passed-through are not live. -unsigned X86InstrInfo:: -getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, - const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI->getOpcode())) +unsigned +X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI.getOpcode())) return 0; // Set the OpNum parameter to the first source operand. OpNum = 1; - const MachineOperand &MO = MI->getOperand(OpNum); + const MachineOperand &MO = MI.getOperand(OpNum); if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { - // Use the same magic number as getPartialRegUpdateClearance. - return 16; + return UndefRegClearance; } return 0; } -void X86InstrInfo:: -breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - unsigned Reg = MI->getOperand(OpNum).getReg(); +void X86InstrInfo::breakPartialRegDependency( + MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { + unsigned Reg = MI.getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. 
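For context on the PartialRegUpdateClearance and UndefRegClearance values returned above: the consumer (the ExeDepsFix pass) compares the distance since the register's last write against the returned preference and, as far as I can tell from that pass, inserts the dependency-breaking idiom only when the write is more recent than the preference, since a stale value is already available and the merge then costs nothing. A minimal sketch of that comparison, with illustrative names:

#include <cassert>

static bool shouldBreakDep(unsigned InstrsSinceLastDef, unsigned Preferred) {
  // Recent write: the partial update would wait on an in-flight value.
  return InstrsSinceLastDef < Preferred;
}

int main() {
  const unsigned Pref = 64;            // default of the new options above
  assert(shouldBreakDep(3, Pref));     // recent write -> insert xorps
  assert(!shouldBreakDep(200, Pref));  // stale value  -> merge is harmless
  return 0;
}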
- if (MI->killsRegister(Reg, TRI)) + if (MI.killsRegister(Reg, TRI)) return; if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) - .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); - MI->addRegisterKilled(Reg, TRI, true); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) - .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) - .addReg(Reg, RegState::ImplicitDefine); - MI->addRegisterKilled(Reg, TRI, true); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); } } -MachineInstr *X86InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { +MachineInstr * +X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, + int FrameIndex, LiveIntervals *LIS) const { // Check switch flag if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5913,7 +6239,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; @@ -5925,8 +6251,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (Size < RCSize) return nullptr; // Change to CMPXXri r, 0 first. - MI->setDesc(get(NewOpc)); - MI->getOperand(1).ChangeToImmediate(0); + MI.setDesc(get(NewOpc)); + MI.getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; @@ -5957,15 +6283,16 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, unsigned RegSize = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); - if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) { + if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) && + RegSize > 4) { // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes), and its user // instruction isn't scalar (SS). 
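The isNonFoldablePartialRegisterLoad checks below come down to a width question: a MOVSS or MOVSD load only brings in 4 or 8 bytes, so it may be folded into a user that reads just that low lane (the *_Int scalar forms listed), but not into one that reads the full register, because the folded memory operand would then read bytes the original load never touched. A trivial standalone statement of that rule (illustrative only):

#include <cassert>
#include <cstddef>

// Folding is safe only if the user reads no more bytes than the load provides.
static bool foldIsSafe(std::size_t LoadBytes, std::size_t UserReadsBytes) {
  return UserReadsBytes <= LoadBytes;
}

int main() {
  const std::size_t MovssBytes = 4;
  assert(foldIsSafe(MovssBytes, 4));   // scalar (SS) user: OK
  assert(!foldIsSafe(MovssBytes, 16)); // full 128-bit packed user: not OK
  return 0;
}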
switch (UserOpc) { - case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: - case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: - case X86::MULSSrr_Int: case X86::VMULSSrr_Int: - case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: + case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: + case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int: + case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int: case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: @@ -5978,15 +6305,16 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } } - if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) { + if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) && + RegSize > 8) { // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes), and its user // instruction isn't scalar (SD). switch (UserOpc) { - case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: - case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: - case X86::MULSDrr_Int: case X86::VMULSDrr_Int: - case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: + case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: + case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int: + case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int: case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: @@ -6003,36 +6331,43 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, + LiveIntervals *LIS) const { // If loading from a FrameIndex, fold directly from the FrameIndex. - unsigned NumOps = LoadMI->getDesc().getNumOperands(); + unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { - if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) return nullptr; - return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex); + return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS); } // Check switch flag if (NoFusing) return nullptr; // Avoid partial register update stalls unless optimizing for size. - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; // Determine the alignment of the load. 
unsigned Alignment = 0; - if (LoadMI->hasOneMemOperand()) - Alignment = (*LoadMI->memoperands_begin())->getAlignment(); + if (LoadMI.hasOneMemOperand()) + Alignment = (*LoadMI.memoperands_begin())->getAlignment(); else - switch (LoadMI->getOpcode()) { + switch (LoadMI.getOpcode()) { + case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: + Alignment = 64; + break; case X86::AVX2_SETALLONES: case X86::AVX_SET0: + case X86::AVX512_256_SET0: Alignment = 32; break; case X86::V_SET0: case X86::V_SETALLONES: + case X86::AVX512_128_SET0: Alignment = 16; break; case X86::FsFLD0SD: @@ -6046,7 +6381,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; @@ -6054,22 +6389,26 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; } // Change to CMPXXri r, 0 first. - MI->setDesc(get(NewOpc)); - MI->getOperand(1).ChangeToImmediate(0); + MI.setDesc(get(NewOpc)); + MI.getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. - if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) + if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg()) return nullptr; SmallVector<MachineOperand,X86::AddrNumOperands> MOs; - switch (LoadMI->getOpcode()) { + switch (LoadMI.getOpcode()) { case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: case X86::AVX_SET0: + case X86::AVX512_128_SET0: + case X86::AVX512_256_SET0: + case X86::AVX512_512_SET0: + case X86::AVX512_512_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. @@ -6082,7 +6421,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; - if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { + if (MF.getTarget().isPositionIndependent()) { if (Subtarget.is64Bit()) PICBase = X86::RIP; else @@ -6096,17 +6435,21 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Create a constant-pool entry. MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; - unsigned Opc = LoadMI->getOpcode(); + unsigned Opc = LoadMI.getOpcode(); if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); - else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) + else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); + else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || + Opc == X86::AVX512_256_SET0) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || + Opc == X86::AVX512_512_SETALLONES); const Constant *C = IsAllOnes ? 
Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); @@ -6120,12 +6463,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( break; } default: { - if (isNonFoldablePartialRegisterLoad(*LoadMI, *MI, MF)) + if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) return nullptr; // Folding a normal load. Just copy the load's address operands. - MOs.append(LoadMI->operands_begin() + NumOps - X86::AddrNumOperands, - LoadMI->operands_begin() + NumOps); + MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands, + LoadMI.operands_begin() + NumOps); break; } } @@ -6133,11 +6476,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, - unsigned Reg, bool UnfoldLoad, bool UnfoldStore, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = - MemOp2RegOpTable.find(MI->getOpcode()); +bool X86InstrInfo::unfoldMemoryOperand( + MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const { + auto I = MemOp2RegOpTable.find(MI.getOpcode()); if (I == MemOp2RegOpTable.end()) return false; unsigned Opc = I->second.first; @@ -6154,8 +6496,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); // TODO: Check if 32-byte or greater accesses are slow too? - if (!MI->hasOneMemOperand() && - RC == &X86::VR128RegClass && + if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass && Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for @@ -6165,8 +6506,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, SmallVector<MachineOperand,2> BeforeOps; SmallVector<MachineOperand,2> AfterOps; SmallVector<MachineOperand,4> ImpOps; - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI->getOperand(i); + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + MachineOperand &Op = MI.getOperand(i); if (i >= Index && i < Index + X86::AddrNumOperands) AddrOps.push_back(Op); else if (Op.isReg() && Op.isImplicit()) @@ -6179,10 +6520,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, // Emit the load instruction. if (UnfoldLoad) { - std::pair<MachineInstr::mmo_iterator, - MachineInstr::mmo_iterator> MMOs = - MF.extractLoadMemRefs(MI->memoperands_begin(), - MI->memoperands_end()); + std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs = + MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end()); loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); if (UnfoldStore) { // Address operands cannot be marked isKill. @@ -6195,7 +6534,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, } // Emit the data processing instruction. - MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); + MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true); MachineInstrBuilder MIB(MF, DataMI); if (FoldedStore) @@ -6248,10 +6587,8 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, // Emit the store instruction. 
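When unfoldMemoryOperand (above) splits a folded instruction back into a load or store plus a register-form operation, the central bookkeeping is partitioning the operand list: the X86 memory reference is a fixed run of AddrNumOperands operands starting at the folded index, everything before it is re-emitted first, everything after it last, and implicit register operands (such as an EFLAGS def) are transferred separately. A sketch of that partitioning with a hypothetical Operand type standing in for MachineOperand:

#include <cstddef>
#include <vector>

struct Operand {
  int Value;
  bool IsImplicitReg;
};

static const unsigned AddrNumOperands = 5; // base, scale, index, disp, segment

static void splitForUnfold(const std::vector<Operand> &Ops, size_t MemIndex,
                           std::vector<Operand> &Addr,
                           std::vector<Operand> &Before,
                           std::vector<Operand> &After,
                           std::vector<Operand> &Imp) {
  for (size_t i = 0, e = Ops.size(); i != e; ++i) {
    if (i >= MemIndex && i < MemIndex + AddrNumOperands)
      Addr.push_back(Ops[i]);   // the folded memory reference
    else if (Ops[i].IsImplicitReg)
      Imp.push_back(Ops[i]);    // implicit uses/defs, added back at the end
    else if (i < MemIndex)
      Before.push_back(Ops[i]);
    else
      After.push_back(Ops[i]);
  }
}

The address operands then drive the new load/store, while Before/After/Imp are replayed onto the register-form data-processing instruction.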
if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); - std::pair<MachineInstr::mmo_iterator, - MachineInstr::mmo_iterator> MMOs = - MF.extractStoreMemRefs(MI->memoperands_begin(), - MI->memoperands_end()); + std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs = + MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end()); storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); } @@ -6264,8 +6601,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, if (!N->isMachineOpcode()) return false; - DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = - MemOp2RegOpTable.find(N->getMachineOpcode()); + auto I = MemOp2RegOpTable.find(N->getMachineOpcode()); if (I == MemOp2RegOpTable.end()) return false; unsigned Opc = I->second.first; @@ -6371,8 +6707,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex) const { - DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I = - MemOp2RegOpTable.find(Opc); + auto I = MemOp2RegOpTable.find(Opc); if (I == MemOp2RegOpTable.end()) return 0; bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; @@ -6411,6 +6746,7 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: + case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions @@ -6421,13 +6757,52 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: + case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: + case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: + // AVX512 load instructions + case X86::VMOVSSZrm: + case X86::VMOVSDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVUPSZ128rm: + case X86::VMOVAPDZ128rm: + case X86::VMOVUPDZ128rm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVAPSZrm: + case X86::VMOVUPSZrm: + case X86::VMOVAPDZrm: + case X86::VMOVUPDZrm: + case X86::VMOVDQU8Zrm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU64Zrm: + case X86::KMOVBkm: + case X86::KMOVWkm: + case X86::KMOVDkm: + case X86::KMOVQkm: break; } switch (Opc2) { @@ -6448,6 +6823,7 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: + case X86::MOVUPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions @@ -6458,13 +6834,52 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: + case X86::VMOVUPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: + case X86::VMOVUPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: + // AVX512 load instructions + case X86::VMOVSSZrm: + case 
X86::VMOVSDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVUPSZ128rm: + case X86::VMOVAPDZ128rm: + case X86::VMOVUPDZ128rm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVUPDZ256rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVAPSZrm: + case X86::VMOVUPSZrm: + case X86::VMOVAPDZrm: + case X86::VMOVUPDZrm: + case X86::VMOVDQU8Zrm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU64Zrm: + case X86::KMOVBkm: + case X86::KMOVWkm: + case X86::KMOVDkm: + case X86::KMOVQkm: break; } @@ -6540,8 +6955,8 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, return true; } -bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, - MachineInstr *Second) const { +bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. @@ -6554,7 +6969,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, FuseInc } FuseKind; - switch(Second->getOpcode()) { + switch (Second.getOpcode()) { default: return false; case X86::JE_1: @@ -6580,7 +6995,7 @@ bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, FuseKind = FuseTest; break; } - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case X86::TEST8rr: @@ -6703,8 +7118,6 @@ bool X86InstrInfo:: ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm()); - if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) - return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } @@ -6827,29 +7240,29 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { } std::pair<uint16_t, uint16_t> -X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { - uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; +X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const { + uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; bool hasAVX2 = Subtarget.hasAVX2(); uint16_t validDomains = 0; - if (domain && lookup(MI->getOpcode(), domain)) + if (domain && lookup(MI.getOpcode(), domain)) validDomains = 0xe; - else if (domain && lookupAVX2(MI->getOpcode(), domain)) + else if (domain && lookupAVX2(MI.getOpcode(), domain)) validDomains = hasAVX2 ? 
0xe : 0x6; return std::make_pair(domain, validDomains); } -void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { +void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); - uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; + uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); - const uint16_t *table = lookup(MI->getOpcode(), dom); + const uint16_t *table = lookup(MI.getOpcode(), dom); if (!table) { // try the other table assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); - table = lookupAVX2(MI->getOpcode(), dom); + table = lookupAVX2(MI.getOpcode(), dom); } assert(table && "Cannot change domain"); - MI->setDesc(get(table[Domain-1])); + MI.setDesc(get(table[Domain - 1])); } /// Return the noop instruction to use for a noop. @@ -6886,6 +7299,10 @@ unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; + case X86::DIVPDrm: + case X86::DIVPDrr: + case X86::DIVPSrm: + case X86::DIVPSrr: case X86::DIVSDrm: case X86::DIVSDrm_Int: case X86::DIVSDrr: @@ -6907,6 +7324,14 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::SQRTSSr: case X86::SQRTSSr_Int: // AVX instructions with high latency + case X86::VDIVPDrm: + case X86::VDIVPDrr: + case X86::VDIVPDYrm: + case X86::VDIVPDYrr: + case X86::VDIVPSrm: + case X86::VDIVPSrr: + case X86::VDIVPSYrm: + case X86::VDIVPSYrr: case X86::VDIVSDrm: case X86::VDIVSDrm_Int: case X86::VDIVSDrr: @@ -6917,55 +7342,277 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVSSrr_Int: case X86::VSQRTPDm: case X86::VSQRTPDr: + case X86::VSQRTPDYm: + case X86::VSQRTPDYr: case X86::VSQRTPSm: case X86::VSQRTPSr: + case X86::VSQRTPSYm: + case X86::VSQRTPSYr: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: + // AVX512 instructions with high latency + case X86::VDIVPDZ128rm: + case X86::VDIVPDZ128rmb: + case X86::VDIVPDZ128rmbk: + case X86::VDIVPDZ128rmbkz: + case X86::VDIVPDZ128rmk: + case X86::VDIVPDZ128rmkz: + case X86::VDIVPDZ128rr: + case X86::VDIVPDZ128rrk: + case X86::VDIVPDZ128rrkz: + case X86::VDIVPDZ256rm: + case X86::VDIVPDZ256rmb: + case X86::VDIVPDZ256rmbk: + case X86::VDIVPDZ256rmbkz: + case X86::VDIVPDZ256rmk: + case X86::VDIVPDZ256rmkz: + case X86::VDIVPDZ256rr: + case X86::VDIVPDZ256rrk: + case X86::VDIVPDZ256rrkz: + case X86::VDIVPDZrb: + case X86::VDIVPDZrbk: + case X86::VDIVPDZrbkz: + case X86::VDIVPDZrm: + case X86::VDIVPDZrmb: + case X86::VDIVPDZrmbk: + case X86::VDIVPDZrmbkz: + case X86::VDIVPDZrmk: + case X86::VDIVPDZrmkz: + case X86::VDIVPDZrr: + case X86::VDIVPDZrrk: + case X86::VDIVPDZrrkz: + case X86::VDIVPSZ128rm: + case X86::VDIVPSZ128rmb: + case X86::VDIVPSZ128rmbk: + case X86::VDIVPSZ128rmbkz: + case X86::VDIVPSZ128rmk: + case X86::VDIVPSZ128rmkz: + case X86::VDIVPSZ128rr: + case X86::VDIVPSZ128rrk: + case X86::VDIVPSZ128rrkz: + case X86::VDIVPSZ256rm: + case X86::VDIVPSZ256rmb: + case X86::VDIVPSZ256rmbk: + case X86::VDIVPSZ256rmbkz: + case X86::VDIVPSZ256rmk: + case X86::VDIVPSZ256rmkz: + case X86::VDIVPSZ256rr: + case X86::VDIVPSZ256rrk: + case X86::VDIVPSZ256rrkz: + case X86::VDIVPSZrb: + case X86::VDIVPSZrbk: + case X86::VDIVPSZrbkz: + case 
X86::VDIVPSZrm: + case X86::VDIVPSZrmb: + case X86::VDIVPSZrmbk: + case X86::VDIVPSZrmbkz: + case X86::VDIVPSZrmk: + case X86::VDIVPSZrmkz: + case X86::VDIVPSZrr: + case X86::VDIVPSZrrk: + case X86::VDIVPSZrrkz: + case X86::VDIVSDZrm: + case X86::VDIVSDZrr: + case X86::VDIVSDZrm_Int: + case X86::VDIVSDZrm_Intk: + case X86::VDIVSDZrm_Intkz: + case X86::VDIVSDZrr_Int: + case X86::VDIVSDZrr_Intk: + case X86::VDIVSDZrr_Intkz: + case X86::VDIVSDZrrb: + case X86::VDIVSDZrrbk: + case X86::VDIVSDZrrbkz: + case X86::VDIVSSZrm: + case X86::VDIVSSZrr: + case X86::VDIVSSZrm_Int: + case X86::VDIVSSZrm_Intk: + case X86::VDIVSSZrm_Intkz: + case X86::VDIVSSZrr_Int: + case X86::VDIVSSZrr_Intk: + case X86::VDIVSSZrr_Intkz: + case X86::VDIVSSZrrb: + case X86::VDIVSSZrrbk: + case X86::VDIVSSZrrbkz: + case X86::VSQRTPDZ128m: + case X86::VSQRTPDZ128mb: + case X86::VSQRTPDZ128mbk: + case X86::VSQRTPDZ128mbkz: + case X86::VSQRTPDZ128mk: + case X86::VSQRTPDZ128mkz: + case X86::VSQRTPDZ128r: + case X86::VSQRTPDZ128rk: + case X86::VSQRTPDZ128rkz: + case X86::VSQRTPDZ256m: + case X86::VSQRTPDZ256mb: + case X86::VSQRTPDZ256mbk: + case X86::VSQRTPDZ256mbkz: + case X86::VSQRTPDZ256mk: + case X86::VSQRTPDZ256mkz: + case X86::VSQRTPDZ256r: + case X86::VSQRTPDZ256rk: + case X86::VSQRTPDZ256rkz: case X86::VSQRTPDZm: + case X86::VSQRTPDZmb: + case X86::VSQRTPDZmbk: + case X86::VSQRTPDZmbkz: + case X86::VSQRTPDZmk: + case X86::VSQRTPDZmkz: case X86::VSQRTPDZr: + case X86::VSQRTPDZrb: + case X86::VSQRTPDZrbk: + case X86::VSQRTPDZrbkz: + case X86::VSQRTPDZrk: + case X86::VSQRTPDZrkz: + case X86::VSQRTPSZ128m: + case X86::VSQRTPSZ128mb: + case X86::VSQRTPSZ128mbk: + case X86::VSQRTPSZ128mbkz: + case X86::VSQRTPSZ128mk: + case X86::VSQRTPSZ128mkz: + case X86::VSQRTPSZ128r: + case X86::VSQRTPSZ128rk: + case X86::VSQRTPSZ128rkz: + case X86::VSQRTPSZ256m: + case X86::VSQRTPSZ256mb: + case X86::VSQRTPSZ256mbk: + case X86::VSQRTPSZ256mbkz: + case X86::VSQRTPSZ256mk: + case X86::VSQRTPSZ256mkz: + case X86::VSQRTPSZ256r: + case X86::VSQRTPSZ256rk: + case X86::VSQRTPSZ256rkz: case X86::VSQRTPSZm: + case X86::VSQRTPSZmb: + case X86::VSQRTPSZmbk: + case X86::VSQRTPSZmbkz: + case X86::VSQRTPSZmk: + case X86::VSQRTPSZmkz: case X86::VSQRTPSZr: + case X86::VSQRTPSZrb: + case X86::VSQRTPSZrbk: + case X86::VSQRTPSZrbkz: + case X86::VSQRTPSZrk: + case X86::VSQRTPSZrkz: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: + case X86::VSQRTSDZm_Intk: + case X86::VSQRTSDZm_Intkz: case X86::VSQRTSDZr: + case X86::VSQRTSDZr_Int: + case X86::VSQRTSDZr_Intk: + case X86::VSQRTSDZr_Intkz: + case X86::VSQRTSDZrb_Int: + case X86::VSQRTSDZrb_Intk: + case X86::VSQRTSDZrb_Intkz: + case X86::VSQRTSSZm: case X86::VSQRTSSZm_Int: + case X86::VSQRTSSZm_Intk: + case X86::VSQRTSSZm_Intkz: case X86::VSQRTSSZr: - case X86::VSQRTSSZm: - case X86::VDIVSDZrm: - case X86::VDIVSDZrr: - case X86::VDIVSSZrm: - case X86::VDIVSSZrr: - - case X86::VGATHERQPSZrm: - case X86::VGATHERQPDZrm: + case X86::VSQRTSSZr_Int: + case X86::VSQRTSSZr_Intk: + case X86::VSQRTSSZr_Intkz: + case X86::VSQRTSSZrb_Int: + case X86::VSQRTSSZrb_Intk: + case X86::VSQRTSSZrb_Intkz: + + case X86::VGATHERDPDYrm: + case X86::VGATHERDPDZ128rm: + case X86::VGATHERDPDZ256rm: case X86::VGATHERDPDZrm: + case X86::VGATHERDPDrm: + case X86::VGATHERDPSYrm: + case X86::VGATHERDPSZ128rm: + case X86::VGATHERDPSZ256rm: case X86::VGATHERDPSZrm: - case X86::VPGATHERQDZrm: - case X86::VPGATHERQQZrm: + case X86::VGATHERDPSrm: + case X86::VGATHERPF0DPDm: + case X86::VGATHERPF0DPSm: + case X86::VGATHERPF0QPDm: + case 
X86::VGATHERPF0QPSm: + case X86::VGATHERPF1DPDm: + case X86::VGATHERPF1DPSm: + case X86::VGATHERPF1QPDm: + case X86::VGATHERPF1QPSm: + case X86::VGATHERQPDYrm: + case X86::VGATHERQPDZ128rm: + case X86::VGATHERQPDZ256rm: + case X86::VGATHERQPDZrm: + case X86::VGATHERQPDrm: + case X86::VGATHERQPSYrm: + case X86::VGATHERQPSZ128rm: + case X86::VGATHERQPSZ256rm: + case X86::VGATHERQPSZrm: + case X86::VGATHERQPSrm: + case X86::VPGATHERDDYrm: + case X86::VPGATHERDDZ128rm: + case X86::VPGATHERDDZ256rm: case X86::VPGATHERDDZrm: + case X86::VPGATHERDDrm: + case X86::VPGATHERDQYrm: + case X86::VPGATHERDQZ128rm: + case X86::VPGATHERDQZ256rm: case X86::VPGATHERDQZrm: - case X86::VSCATTERQPDZmr: - case X86::VSCATTERQPSZmr: + case X86::VPGATHERDQrm: + case X86::VPGATHERQDYrm: + case X86::VPGATHERQDZ128rm: + case X86::VPGATHERQDZ256rm: + case X86::VPGATHERQDZrm: + case X86::VPGATHERQDrm: + case X86::VPGATHERQQYrm: + case X86::VPGATHERQQZ128rm: + case X86::VPGATHERQQZ256rm: + case X86::VPGATHERQQZrm: + case X86::VPGATHERQQrm: + case X86::VSCATTERDPDZ128mr: + case X86::VSCATTERDPDZ256mr: case X86::VSCATTERDPDZmr: + case X86::VSCATTERDPSZ128mr: + case X86::VSCATTERDPSZ256mr: case X86::VSCATTERDPSZmr: - case X86::VPSCATTERQDZmr: - case X86::VPSCATTERQQZmr: + case X86::VSCATTERPF0DPDm: + case X86::VSCATTERPF0DPSm: + case X86::VSCATTERPF0QPDm: + case X86::VSCATTERPF0QPSm: + case X86::VSCATTERPF1DPDm: + case X86::VSCATTERPF1DPSm: + case X86::VSCATTERPF1QPDm: + case X86::VSCATTERPF1QPSm: + case X86::VSCATTERQPDZ128mr: + case X86::VSCATTERQPDZ256mr: + case X86::VSCATTERQPDZmr: + case X86::VSCATTERQPSZ128mr: + case X86::VSCATTERQPSZ256mr: + case X86::VSCATTERQPSZmr: + case X86::VPSCATTERDDZ128mr: + case X86::VPSCATTERDDZ256mr: case X86::VPSCATTERDDZmr: + case X86::VPSCATTERDQZ128mr: + case X86::VPSCATTERDQZ256mr: case X86::VPSCATTERDQZmr: + case X86::VPSCATTERQDZ128mr: + case X86::VPSCATTERQDZ256mr: + case X86::VPSCATTERQDZmr: + case X86::VPSCATTERQQZ128mr: + case X86::VPSCATTERQQZ256mr: + case X86::VPSCATTERQQZmr: return true; } } -bool X86InstrInfo:: -hasHighOperandLatency(const TargetSchedModel &SchedModel, - const MachineRegisterInfo *MRI, - const MachineInstr *DefMI, unsigned DefIdx, - const MachineInstr *UseMI, unsigned UseIdx) const { - return isHighLatencyDef(DefMI->getOpcode()); +bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel, + const MachineRegisterInfo *MRI, + const MachineInstr &DefMI, + unsigned DefIdx, + const MachineInstr &UseMI, + unsigned UseIdx) const { + return isHighLatencyDef(DefMI.getOpcode()); } bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, @@ -7014,12 +7661,119 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::PANDrr: case X86::PORrr: case X86::PXORrr: + case X86::ANDPDrr: + case X86::ANDPSrr: + case X86::ORPDrr: + case X86::ORPSrr: + case X86::XORPDrr: + case X86::XORPSrr: + case X86::PADDBrr: + case X86::PADDWrr: + case X86::PADDDrr: + case X86::PADDQrr: case X86::VPANDrr: case X86::VPANDYrr: + case X86::VPANDDZ128rr: + case X86::VPANDDZ256rr: + case X86::VPANDDZrr: + case X86::VPANDQZ128rr: + case X86::VPANDQZ256rr: + case X86::VPANDQZrr: case X86::VPORrr: case X86::VPORYrr: + case X86::VPORDZ128rr: + case X86::VPORDZ256rr: + case X86::VPORDZrr: + case X86::VPORQZ128rr: + case X86::VPORQZ256rr: + case X86::VPORQZrr: case X86::VPXORrr: case X86::VPXORYrr: + case X86::VPXORDZ128rr: + case X86::VPXORDZ256rr: + case X86::VPXORDZrr: + case X86::VPXORQZ128rr: + case X86::VPXORQZ256rr: + case 
X86::VPXORQZrr: + case X86::VANDPDrr: + case X86::VANDPSrr: + case X86::VANDPDYrr: + case X86::VANDPSYrr: + case X86::VANDPDZ128rr: + case X86::VANDPSZ128rr: + case X86::VANDPDZ256rr: + case X86::VANDPSZ256rr: + case X86::VANDPDZrr: + case X86::VANDPSZrr: + case X86::VORPDrr: + case X86::VORPSrr: + case X86::VORPDYrr: + case X86::VORPSYrr: + case X86::VORPDZ128rr: + case X86::VORPSZ128rr: + case X86::VORPDZ256rr: + case X86::VORPSZ256rr: + case X86::VORPDZrr: + case X86::VORPSZrr: + case X86::VXORPDrr: + case X86::VXORPSrr: + case X86::VXORPDYrr: + case X86::VXORPSYrr: + case X86::VXORPDZ128rr: + case X86::VXORPSZ128rr: + case X86::VXORPDZ256rr: + case X86::VXORPSZ256rr: + case X86::VXORPDZrr: + case X86::VXORPSZrr: + case X86::KADDBrr: + case X86::KADDWrr: + case X86::KADDDrr: + case X86::KADDQrr: + case X86::KANDBrr: + case X86::KANDWrr: + case X86::KANDDrr: + case X86::KANDQrr: + case X86::KORBrr: + case X86::KORWrr: + case X86::KORDrr: + case X86::KORQrr: + case X86::KXORBrr: + case X86::KXORWrr: + case X86::KXORDrr: + case X86::KXORQrr: + case X86::VPADDBrr: + case X86::VPADDWrr: + case X86::VPADDDrr: + case X86::VPADDQrr: + case X86::VPADDBYrr: + case X86::VPADDWYrr: + case X86::VPADDDYrr: + case X86::VPADDQYrr: + case X86::VPADDBZ128rr: + case X86::VPADDWZ128rr: + case X86::VPADDDZ128rr: + case X86::VPADDQZ128rr: + case X86::VPADDBZ256rr: + case X86::VPADDWZ256rr: + case X86::VPADDDZ256rr: + case X86::VPADDQZ256rr: + case X86::VPADDBZrr: + case X86::VPADDWZrr: + case X86::VPADDDZrr: + case X86::VPADDQZrr: + case X86::VPMULLWrr: + case X86::VPMULLWYrr: + case X86::VPMULLWZ128rr: + case X86::VPMULLWZ256rr: + case X86::VPMULLWZrr: + case X86::VPMULLDrr: + case X86::VPMULLDYrr: + case X86::VPMULLDZ128rr: + case X86::VPMULLDZ256rr: + case X86::VPMULLDZrr: + case X86::VPMULLQZ128rr: + case X86::VPMULLQZ256rr: + case X86::VPMULLQZrr: // Normal min/max instructions are not commutative because of NaN and signed // zero semantics, but these are. Thus, there's no need to check for global // relaxed math; the instructions themselves have the properties we need. 
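The long opcode lists being added to isAssociativeAndCommutative feed the machine combiner's reassociation: once an operation is known to be associative and commutative, a serial chain such as ((a + b) + c) + d can be rebalanced into (a + b) + (c + d), shortening the critical path. Integer and bitwise vector ops qualify unconditionally; FP add/mul qualify only under unsafe-fp-math, as the tail of this switch shows. A scalar C++ illustration of why the rebalancing helps (not LLVM code):

#include <cstdio>

static int addChainSerial(const int *V, int N) {
  int Acc = 0;
  for (int i = 0; i < N; ++i)
    Acc += V[i];             // one long dependence chain
  return Acc;
}

static int addChainRebalanced(const int *V, int N) {
  int A = 0, B = 0;          // two independent chains the CPU can overlap
  for (int i = 0; i + 1 < N; i += 2) {
    A += V[i];
    B += V[i + 1];
  }
  if (N & 1)
    A += V[N - 1];
  return A + B;              // legal because integer add is associative/commutative
}

int main() {
  int V[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::printf("%d %d\n", addChainSerial(V, 8), addChainRebalanced(V, 8));
  return 0;
}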
@@ -7035,14 +7789,30 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VMAXCPSrr: case X86::VMAXCPDYrr: case X86::VMAXCPSYrr: + case X86::VMAXCPDZ128rr: + case X86::VMAXCPSZ128rr: + case X86::VMAXCPDZ256rr: + case X86::VMAXCPSZ256rr: + case X86::VMAXCPDZrr: + case X86::VMAXCPSZrr: case X86::VMAXCSDrr: case X86::VMAXCSSrr: + case X86::VMAXCSDZrr: + case X86::VMAXCSSZrr: case X86::VMINCPDrr: case X86::VMINCPSrr: case X86::VMINCPDYrr: case X86::VMINCPSYrr: + case X86::VMINCPDZ128rr: + case X86::VMINCPSZ128rr: + case X86::VMINCPDZ256rr: + case X86::VMINCPSZ256rr: + case X86::VMINCPDZrr: + case X86::VMINCPSZrr: case X86::VMINCSDrr: case X86::VMINCSSrr: + case X86::VMINCSDZrr: + case X86::VMINCSSZrr: return true; case X86::ADDPDrr: case X86::ADDPSrr: @@ -7056,14 +7826,30 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { case X86::VADDPSrr: case X86::VADDPDYrr: case X86::VADDPSYrr: + case X86::VADDPDZ128rr: + case X86::VADDPSZ128rr: + case X86::VADDPDZ256rr: + case X86::VADDPSZ256rr: + case X86::VADDPDZrr: + case X86::VADDPSZrr: case X86::VADDSDrr: case X86::VADDSSrr: + case X86::VADDSDZrr: + case X86::VADDSSZrr: case X86::VMULPDrr: case X86::VMULPSrr: case X86::VMULPDYrr: case X86::VMULPSYrr: + case X86::VMULPDZ128rr: + case X86::VMULPSZ128rr: + case X86::VMULPDZ256rr: + case X86::VMULPSZ256rr: + case X86::VMULPDZrr: + case X86::VMULPSZrr: case X86::VMULSDrr: case X86::VMULSSrr: + case X86::VMULSDZrr: + case X86::VMULSSZrr: return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; default: return false; @@ -7135,10 +7921,8 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { {MO_NTPOFF, "x86-ntpoff"}, {MO_GOTNTPOFF, "x86-gotntpoff"}, {MO_DLLIMPORT, "x86-dllimport"}, - {MO_DARWIN_STUB, "x86-darwin-stub"}, {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, - {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, {MO_TLVP, "x86-tlvp"}, {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, {MO_SECREL, "x86-secrel"}}; @@ -7163,7 +7947,7 @@ namespace { return false; // Only emit a global base reg in PIC mode. - if (TM->getRelocationModel() != Reloc::PIC_) + if (!TM->isPositionIndependent()) return false; X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); @@ -7223,7 +8007,10 @@ namespace { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { - X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>(); + if (skipFunction(*MF.getFunction())) + return false; + + X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); if (MFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. return false; @@ -7249,9 +8036,9 @@ namespace { case X86::TLS_base_addr32: case X86::TLS_base_addr64: if (TLSBaseAddrReg) - I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); + I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg); else - I = SetRegister(I, &TLSBaseAddrReg); + I = SetRegister(*I, &TLSBaseAddrReg); Changed = true; break; default: @@ -7270,29 +8057,29 @@ namespace { // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. 
- MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, + MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I, unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineFunction *MF = I.getParent()->getParent(); const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); const bool is64Bit = STI.is64Bit(); const X86InstrInfo *TII = STI.getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - is64Bit ? X86::RAX : X86::EAX) - .addReg(TLSBaseAddrReg); + MachineInstr *Copy = + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX) + .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + I.eraseFromParent(); return Copy; } // Create a virtal register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I.getParent()->getParent(); const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>(); const bool is64Bit = STI.is64Bit(); const X86InstrInfo *TII = STI.getInstrInfo(); @@ -7304,11 +8091,11 @@ namespace { : &X86::GR32RegClass); // Insert a copy from RAX/EAX to TLSBaseAddrReg. - MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg) - .addReg(is64Bit ? X86::RAX : X86::EAX); + MachineInstr *Next = I.getNextNode(); + MachineInstr *Copy = + BuildMI(*I.getParent(), Next, I.getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(is64Bit ? X86::RAX : X86::EAX); return Copy; } |
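The LDTLSCleanup helpers above amount to a small CSE over local-dynamic TLS accesses: the result of the first TLS_base_addr call in the function is stashed in a virtual register (SetRegister), and every later TLS_base_addr becomes a plain copy from that register (ReplaceTLSBaseAddrCall). A source-level analogue of the before/after behavior, where tls_get_base is a toy stand-in for the runtime call rather than a real API:

#include <cstdio>

alignas(8) static char Storage[64];               // toy stand-in for the TLS block
static char *tls_get_base() { return Storage; }   // models the TLS_base_addr call

static long loadBefore(long OffA, long OffB) {
  long A = *(long *)(tls_get_base() + OffA);      // call #1
  long B = *(long *)(tls_get_base() + OffB);      // call #2 is the redundancy the pass removes
  return A + B;
}

static long loadAfter(long OffA, long OffB) {
  char *Base = tls_get_base();                    // kept once, like TLSBaseAddrReg
  long A = *(long *)(Base + OffA);
  long B = *(long *)(Base + OffB);
  return A + B;
}

int main() {
  *(long *)(Storage + 0) = 40;
  *(long *)(Storage + 8) = 2;
  std::printf("%ld %ld\n", loadBefore(0, 8), loadAfter(0, 8));
  return 0;
}

This is also why runOnMachineFunction bails out unless the function performs at least two local-dynamic TLS accesses: with a single access there is nothing to reuse.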